forked from phoenix/litellm-mirror

(refactor) caching use LLMCachingHandler for async_get_cache and set_cache (#6208)

* use folder for caching
* fix importing caching
* fix clickhouse pyright
* fix linting
* fix correctly pass kwargs and args
* fix test case for embedding
* fix linting
* fix embedding caching logic
* fix refactor handle utils.py
* fix test_embedding_caching_azure_individual_items_reordered

This commit is contained in: parent 20e50d7002, commit 4d1b4beb3d
96 changed files with 690 additions and 489 deletions
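Most of the hunks below are a mechanical import-path migration: the single module `litellm/caching.py` becomes the package `litellm/caching/`, so every `from litellm.caching import ...` turns into `from litellm.caching.caching import ...`. A minimal before/after sketch of what a caller has to change (the model name and prompt are illustrative; the import names come from the hunks below):

```python
# before this commit, caching objects lived in litellm/caching.py:
#   from litellm.caching import Cache
# after it, caching.py sits inside the litellm/caching/ package:
from litellm.caching.caching import Cache

import litellm
from litellm import completion

litellm.cache = Cache()  # in-memory cache, as in the docs hunks below

# illustrative call; a repeat call with the same arguments is served from the cache
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    caching=True,
)
```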
@@ -10,7 +10,7 @@ sys.path.insert(
 import asyncio
 from litellm import Router, Timeout
 import time
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 import litellm
 import openai
@@ -10,7 +10,7 @@ sys.path.insert(
 import asyncio
 from litellm import Router, Timeout
 import time
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 import litellm

 litellm.cache = Cache(
@@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';

 # Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk

-[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
+[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm.caching.caching.py)

 :::info
@@ -31,7 +31,7 @@ For the hosted version you can setup your own Redis DB here: https://app.redisla
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)
@@ -68,7 +68,7 @@ AWS_SECRET_ACCESS_KEY = "WOl*****"
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 # pass s3-bucket name
 litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2")
@@ -101,7 +101,7 @@ For the hosted version you can setup your own Redis DB here: https://app.redisla
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 random_number = random.randint(
     1, 100000
@@ -155,7 +155,7 @@ To set up a Qdrant cluster locally follow: https://qdrant.tech/documentation/qui
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 random_number = random.randint(
     1, 100000
@@ -210,7 +210,7 @@ assert response1.id == response2.id
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 # Make completion calls
@@ -246,7 +246,7 @@ Then you can use the disk cache as follows.
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache(type="disk")

 # Make completion calls
@@ -422,7 +422,7 @@ def custom_get_cache_key(*args, **kwargs):

 Set your function as litellm.cache.get_cache_key
 ```python
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
@@ -434,7 +434,7 @@ litellm.cache = cache # set litellm.cache to your cache
 ## How to write custom add/get cache functions
 ### 1. Init Cache
 ```python
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 cache = Cache()
 ```
@@ -6,7 +6,7 @@ Use api.litellm.ai for caching `completion()` and `embedding()` responses
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache(type="hosted") # init cache to use api.litellm.ai

 # Make completion calls
@@ -31,7 +31,7 @@ response2 = completion(
 import time
 import litellm
 from litellm import completion, embedding
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache(type="hosted")

 start_time = time.time()
@@ -53,7 +53,7 @@ LiteLLM can cache your streamed responses for you
 import litellm
 import time
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 litellm.cache = Cache(type="hosted")
@@ -13,7 +13,7 @@ Keys in the cache are `model`, the following example will lead to a cache hit
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 # Make completion calls
@@ -35,7 +35,7 @@ response2 = completion(
 Add custom key-value pairs to your cache.

 ```python
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 cache = Cache()

 cache.add_cache(cache_key="test-key", result="1234")
@@ -50,7 +50,7 @@ LiteLLM can cache your streamed responses for you
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 # Make completion calls
@@ -77,7 +77,7 @@ Keys in the cache are `model`, the following example will lead to a cache hit
 import time
 import litellm
 from litellm import embedding
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 start_time = time.time()
@@ -49,13 +49,13 @@ litellm_settings:
   cache: true
   cache_params: # set cache params for redis
     type: redis
-    namespace: "litellm_caching"
+    namespace: "litellm.caching.caching"
 ```

 and keys will be stored like:

 ```
-litellm_caching:<hash>
+litellm.caching.caching:<hash>
 ```

 #### Redis Cluster
@@ -645,7 +645,7 @@ litellm_settings:
     host: "localhost" # The host address for the Redis cache. Required if type is "redis".
     port: 6379 # The port number for the Redis cache. Required if type is "redis".
     password: "your_password" # The password for the Redis cache. Required if type is "redis".
-    namespace: "litellm_caching" # namespace for redis cache
+    namespace: "litellm.caching.caching" # namespace for redis cache

     # Optional - Redis Cluster Settings
     redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}]
@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Literal, Optional, Union

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata
@@ -6,7 +6,7 @@ import dotenv, os
 import requests

 from litellm.proxy._types import UserAPIKeyAuth
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache

 from typing import Literal, Union, Optional
@@ -13,7 +13,7 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 from typing import Optional, Literal, Union, Any
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from fastapi import HTTPException
@@ -9,7 +9,7 @@

 from typing import Optional, Literal
 import litellm
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from litellm._logging import verbose_proxy_logger
@@ -10,7 +10,7 @@
 from typing import Optional, Literal
 import litellm
 from litellm.proxy.utils import PrismaClient
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth, LiteLLM_EndUserTable
 from litellm.integrations.custom_logger import CustomLogger
 from litellm._logging import verbose_proxy_logger
@@ -9,7 +9,7 @@

 from typing import Optional, Literal, Union
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException
@@ -15,7 +15,7 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 from typing import Optional, Literal, Union
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException
@@ -13,7 +13,7 @@ import traceback
 import sys
 import uuid
 import os
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException
@@ -12,7 +12,7 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 from typing import Optional, Literal, Union
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException
@@ -12,7 +12,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 ) # Adds the parent directory to the system path
 from typing import Optional
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm._logging import verbose_proxy_logger
 import tempfile
@@ -7,7 +7,7 @@ import threading
 import os
 from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 from litellm._logging import (
     set_verbose,
     _turn_on_debug,
@@ -2,7 +2,6 @@ from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from typing import List, Optional

 import litellm
-from litellm import completion
 from litellm._logging import print_verbose
 from litellm.utils import get_optional_params
@@ -108,7 +107,7 @@ def batch_completion(
             if "kwargs" in kwargs_modified:
                 original_kwargs = kwargs_modified.pop("kwargs")
             future = executor.submit(
-                completion, **kwargs_modified, **original_kwargs
+                litellm.completion, **kwargs_modified, **original_kwargs
             )
             completions.append(future)
@@ -156,7 +155,7 @@ def batch_completion_models(*args, **kwargs):
         with ThreadPoolExecutor(max_workers=len(models)) as executor:
             for model in models:
                 futures[model] = executor.submit(
-                    completion, *args, model=model, **kwargs
+                    litellm.completion, *args, model=model, **kwargs
                 )

             for model, future in sorted(
@@ -178,7 +177,9 @@ def batch_completion_models(*args, **kwargs):
                     ): # don't override deployment values e.g. model name, api base, etc.
                         deployment[key] = kwargs[key]
                 kwargs = {**deployment, **nested_kwargs}
-                futures[deployment["model"]] = executor.submit(completion, **kwargs)
+                futures[deployment["model"]] = executor.submit(
+                    litellm.completion, **kwargs
+                )

         while futures:
             # wait for the first returned future
@@ -246,7 +247,7 @@ def batch_completion_models_all_responses(*args, **kwargs):

     with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
         for idx, model in enumerate(models):
-            future = executor.submit(completion, *args, model=model, **kwargs)
+            future = executor.submit(litellm.completion, *args, model=model, **kwargs)
             if future.result() is not None:
                 responses.append(future.result())
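The hunks above drop the module-level `from litellm import completion` and switch every thread-pool submission in the `batch_completion*` helpers to the fully qualified `litellm.completion`. A minimal sketch of the resulting call pattern (the two model names and the prompt are illustrative, not taken from the diff):

```python
import concurrent.futures

import litellm

models = ["gpt-3.5-turbo", "claude-3-haiku-20240307"]  # illustrative model names
kwargs = {"messages": [{"role": "user", "content": "hello"}]}

futures = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
    for model in models:
        # fully qualified call, matching the refactored batch_completion_models
        futures[model] = executor.submit(litellm.completion, model=model, **kwargs)

# all futures have completed once the with-block exits
responses = {model: future.result() for model, future in futures.items()}
```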
@@ -212,7 +212,7 @@ class RedisCache(BaseCache):

         from litellm._service_logger import ServiceLogging

-        from ._redis import get_redis_client, get_redis_connection_pool
+        from .._redis import get_redis_client, get_redis_connection_pool

         redis_kwargs = {}
         if host is not None:
@@ -276,7 +276,7 @@ class RedisCache(BaseCache):
         )

     def init_async_client(self):
-        from ._redis import get_redis_async_client
+        from .._redis import get_redis_async_client

         return get_redis_async_client(
             connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
@@ -302,7 +302,7 @@ class RedisCache(BaseCache):
         except Exception as e:
             # NON blocking - notify users Redis is throwing an exception
             print_verbose(
-                f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
+                f"litellm.caching.caching: set() - Got exception from REDIS : {str(e)}"
             )

     def increment_cache(
@@ -705,7 +705,7 @@ class RedisCache(BaseCache):
         except Exception as e:
             # NON blocking - notify users Redis is throwing an exception
             verbose_logger.error(
-                "LiteLLM Caching: get() - Got exception from REDIS: ", e
+                "litellm.caching.caching: get() - Got exception from REDIS: ", e
             )

     def batch_get_cache(self, key_list) -> dict:
@@ -781,7 +781,7 @@ class RedisCache(BaseCache):
             )
             # NON blocking - notify users Redis is throwing an exception
             print_verbose(
-                f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
+                f"litellm.caching.caching: async get() - Got exception from REDIS: {str(e)}"
             )

     async def async_batch_get_cache(self, key_list) -> dict:
litellm/caching/caching_handler.py (new file, 440 lines)
@@ -0,0 +1,440 @@
"""
This contains LLMCachingHandler

This exposes two methods:
- async_get_cache
- async_set_cache

This file is a wrapper around caching.py

In each method it will call the appropriate method from caching.py
"""

import asyncio
import datetime
import threading
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

from pydantic import BaseModel

import litellm
from litellm._logging import print_verbose
from litellm.caching.caching import (
    Cache,
    QdrantSemanticCache,
    RedisCache,
    RedisSemanticCache,
    S3Cache,
)
from litellm.types.rerank import RerankResponse
from litellm.types.utils import (
    CallTypes,
    Embedding,
    EmbeddingResponse,
    ModelResponse,
    TextCompletionResponse,
    TranscriptionResponse,
)

if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
else:
    LiteLLMLoggingObj = Any


class CachingHandlerResponse(BaseModel):
    """
    This is the response object for the caching handler. We need to separate embedding cached responses and (completion / text_completion / transcription) cached responses

    For embeddings there can be a cache hit for some of the inputs in the list and a cache miss for others
    """

    cached_result: Optional[Any] = None
    final_embedding_cached_response: Optional[EmbeddingResponse] = None
    embedding_all_elements_cache_hit: bool = (
        False  # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
    )


class LLMCachingHandler:
    def __init__(self):
        pass

    async def _async_get_cache(
        self,
        model: str,
        original_function: Callable,
        logging_obj: LiteLLMLoggingObj,
        start_time: datetime.datetime,
        call_type: str,
        kwargs: Dict[str, Any],
        args: Optional[Tuple[Any, ...]] = None,
    ) -> CachingHandlerResponse:
        """
        Internal method to get from the cache.
        Handles different call types (embeddings, chat/completions, text_completion, transcription)
        and accordingly returns the cached response

        Args:
            model: str:
            original_function: Callable:
            logging_obj: LiteLLMLoggingObj:
            start_time: datetime.datetime:
            call_type: str:
            kwargs: Dict[str, Any]:
            args: Optional[Tuple[Any, ...]] = None:

        Returns:
            CachingHandlerResponse:
        Raises:
            None
        """
        from litellm.utils import (
            CustomStreamWrapper,
            convert_to_model_response_object,
            convert_to_streaming_response_async,
        )

        args = args or ()

        final_embedding_cached_response: Optional[EmbeddingResponse] = None
        cached_result: Optional[Any] = None
        if (
            (kwargs.get("caching", None) is None and litellm.cache is not None)
            or kwargs.get("caching", False) is True
        ) and (
            kwargs.get("cache", {}).get("no-cache", False) is not True
        ): # allow users to control returning cached responses from the completion function
            # checking cache
            print_verbose("INSIDE CHECKING CACHE")
            if (
                litellm.cache is not None
                and litellm.cache.supported_call_types is not None
                and str(original_function.__name__)
                in litellm.cache.supported_call_types
            ):
                print_verbose("Checking Cache")
                if call_type == CallTypes.aembedding.value and isinstance(
                    kwargs["input"], list
                ):
                    tasks = []
                    for idx, i in enumerate(kwargs["input"]):
                        preset_cache_key = litellm.cache.get_cache_key(
                            *args, **{**kwargs, "input": i}
                        )
                        tasks.append(
                            litellm.cache.async_get_cache(cache_key=preset_cache_key)
                        )
                    cached_result = await asyncio.gather(*tasks)
                    ## check if cached result is None ##
                    if cached_result is not None and isinstance(cached_result, list):
                        # set cached_result to None if all elements are None
                        if all(result is None for result in cached_result):
                            cached_result = None
                elif isinstance(litellm.cache.cache, RedisSemanticCache) or isinstance(
                    litellm.cache.cache, RedisCache
                ):
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = await litellm.cache.async_get_cache(*args, **kwargs)
                elif isinstance(litellm.cache.cache, QdrantSemanticCache):
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = await litellm.cache.async_get_cache(*args, **kwargs)
                else:  # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = litellm.cache.get_cache(*args, **kwargs)
            if cached_result is not None and not isinstance(cached_result, list):
                print_verbose("Cache Hit!")
                cache_hit = True
                end_time = datetime.datetime.now()
                (
                    model,
                    custom_llm_provider,
                    dynamic_api_key,
                    api_base,
                ) = litellm.get_llm_provider(
                    model=model,
                    custom_llm_provider=kwargs.get("custom_llm_provider", None),
                    api_base=kwargs.get("api_base", None),
                    api_key=kwargs.get("api_key", None),
                )
                print_verbose(
                    f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
                )
                logging_obj.update_environment_variables(
                    model=model,
                    user=kwargs.get("user", None),
                    optional_params={},
                    litellm_params={
                        "logger_fn": kwargs.get("logger_fn", None),
                        "acompletion": True,
                        "metadata": kwargs.get("metadata", {}),
                        "model_info": kwargs.get("model_info", {}),
                        "proxy_server_request": kwargs.get(
                            "proxy_server_request", None
                        ),
                        "preset_cache_key": kwargs.get("preset_cache_key", None),
                        "stream_response": kwargs.get("stream_response", {}),
                        "api_base": kwargs.get("api_base", ""),
                    },
                    input=kwargs.get("messages", ""),
                    api_key=kwargs.get("api_key", None),
                    original_response=str(cached_result),
                    additional_args=None,
                    stream=kwargs.get("stream", False),
                )
                call_type = original_function.__name__
                if call_type == CallTypes.acompletion.value and isinstance(
                    cached_result, dict
                ):
                    if kwargs.get("stream", False) is True:
                        cached_result = convert_to_streaming_response_async(
                            response_object=cached_result,
                        )
                        cached_result = CustomStreamWrapper(
                            completion_stream=cached_result,
                            model=model,
                            custom_llm_provider="cached_response",
                            logging_obj=logging_obj,
                        )
                    else:
                        cached_result = convert_to_model_response_object(
                            response_object=cached_result,
                            model_response_object=ModelResponse(),
                        )
                if call_type == CallTypes.atext_completion.value and isinstance(
                    cached_result, dict
                ):
                    if kwargs.get("stream", False) is True:
                        cached_result = convert_to_streaming_response_async(
                            response_object=cached_result,
                        )
                        cached_result = CustomStreamWrapper(
                            completion_stream=cached_result,
                            model=model,
                            custom_llm_provider="cached_response",
                            logging_obj=logging_obj,
                        )
                    else:
                        cached_result = TextCompletionResponse(**cached_result)
                elif call_type == CallTypes.aembedding.value and isinstance(
                    cached_result, dict
                ):
                    cached_result = convert_to_model_response_object(
                        response_object=cached_result,
                        model_response_object=EmbeddingResponse(),
                        response_type="embedding",
                    )
                elif call_type == CallTypes.arerank.value and isinstance(
                    cached_result, dict
                ):
                    cached_result = convert_to_model_response_object(
                        response_object=cached_result,
                        model_response_object=None,
                        response_type="rerank",
                    )
                elif call_type == CallTypes.atranscription.value and isinstance(
                    cached_result, dict
                ):
                    hidden_params = {
                        "model": "whisper-1",
                        "custom_llm_provider": custom_llm_provider,
                        "cache_hit": True,
                    }
                    cached_result = convert_to_model_response_object(
                        response_object=cached_result,
                        model_response_object=TranscriptionResponse(),
                        response_type="audio_transcription",
                        hidden_params=hidden_params,
                    )
                if kwargs.get("stream", False) is False:
                    # LOG SUCCESS
                    asyncio.create_task(
                        logging_obj.async_success_handler(
                            cached_result, start_time, end_time, cache_hit
                        )
                    )
                    threading.Thread(
                        target=logging_obj.success_handler,
                        args=(cached_result, start_time, end_time, cache_hit),
                    ).start()
                cache_key = kwargs.get("preset_cache_key", None)
                if (
                    isinstance(cached_result, BaseModel)
                    or isinstance(cached_result, CustomStreamWrapper)
                ) and hasattr(cached_result, "_hidden_params"):
                    cached_result._hidden_params["cache_key"] = cache_key  # type: ignore
                return CachingHandlerResponse(cached_result=cached_result)
            elif (
                call_type == CallTypes.aembedding.value
                and cached_result is not None
                and isinstance(cached_result, list)
                and litellm.cache is not None
                and not isinstance(
                    litellm.cache.cache, S3Cache
                )  # s3 doesn't support bulk writing. Exclude.
            ):
                remaining_list = []
                non_null_list = []
                for idx, cr in enumerate(cached_result):
                    if cr is None:
                        remaining_list.append(kwargs["input"][idx])
                    else:
                        non_null_list.append((idx, cr))
                original_kwargs_input = kwargs["input"]
                kwargs["input"] = remaining_list
                if len(non_null_list) > 0:
                    print_verbose(f"EMBEDDING CACHE HIT! - {len(non_null_list)}")
                    final_embedding_cached_response = EmbeddingResponse(
                        model=kwargs.get("model"),
                        data=[None] * len(original_kwargs_input),
                    )
                    final_embedding_cached_response._hidden_params["cache_hit"] = (
                        True
                    )

                    for val in non_null_list:
                        idx, cr = val  # (idx, cr) tuple
                        if cr is not None:
                            final_embedding_cached_response.data[idx] = Embedding(
                                embedding=cr["embedding"],
                                index=idx,
                                object="embedding",
                            )
                if len(remaining_list) == 0:
                    # LOG SUCCESS
                    cache_hit = True
                    end_time = datetime.datetime.now()
                    (
                        model,
                        custom_llm_provider,
                        dynamic_api_key,
                        api_base,
                    ) = litellm.get_llm_provider(
                        model=model,
                        custom_llm_provider=kwargs.get("custom_llm_provider", None),
                        api_base=kwargs.get("api_base", None),
                        api_key=kwargs.get("api_key", None),
                    )
                    print_verbose(
                        f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
                    )
                    logging_obj.update_environment_variables(
                        model=model,
                        user=kwargs.get("user", None),
                        optional_params={},
                        litellm_params={
                            "logger_fn": kwargs.get("logger_fn", None),
                            "acompletion": True,
                            "metadata": kwargs.get("metadata", {}),
                            "model_info": kwargs.get("model_info", {}),
                            "proxy_server_request": kwargs.get(
                                "proxy_server_request", None
                            ),
                            "preset_cache_key": kwargs.get(
                                "preset_cache_key", None
                            ),
                            "stream_response": kwargs.get("stream_response", {}),
                            "api_base": "",
                        },
                        input=kwargs.get("messages", ""),
                        api_key=kwargs.get("api_key", None),
                        original_response=str(final_embedding_cached_response),
                        additional_args=None,
                        stream=kwargs.get("stream", False),
                    )
                    asyncio.create_task(
                        logging_obj.async_success_handler(
                            final_embedding_cached_response,
                            start_time,
                            end_time,
                            cache_hit,
                        )
                    )
                    threading.Thread(
                        target=logging_obj.success_handler,
                        args=(
                            final_embedding_cached_response,
                            start_time,
                            end_time,
                            cache_hit,
                        ),
                    ).start()
                    return CachingHandlerResponse(
                        final_embedding_cached_response=final_embedding_cached_response,
                        embedding_all_elements_cache_hit=True,
                    )
        return CachingHandlerResponse(
            cached_result=cached_result,
            final_embedding_cached_response=final_embedding_cached_response,
        )

    async def _async_set_cache(
        self,
        result: Any,
        original_function: Callable,
        kwargs: Dict[str, Any],
        args: Optional[Tuple[Any, ...]] = None,
    ):
        """
        Internal method to check the type of the result & cache used and adds the result to the cache accordingly

        Args:
            result: Any:
            original_function: Callable:
            kwargs: Dict[str, Any]:
            args: Optional[Tuple[Any, ...]] = None:

        Returns:
            None
        Raises:
            None
        """
        args = args or ()
        # [OPTIONAL] ADD TO CACHE
        if (
            (litellm.cache is not None)
            and litellm.cache.supported_call_types is not None
            and (str(original_function.__name__) in litellm.cache.supported_call_types)
            and (kwargs.get("cache", {}).get("no-store", False) is not True)
        ):
            if (
                isinstance(result, litellm.ModelResponse)
                or isinstance(result, litellm.EmbeddingResponse)
                or isinstance(result, TranscriptionResponse)
                or isinstance(result, RerankResponse)
            ):
                if (
                    isinstance(result, EmbeddingResponse)
                    and isinstance(kwargs["input"], list)
                    and litellm.cache is not None
                    and not isinstance(
                        litellm.cache.cache, S3Cache
                    )  # s3 doesn't support bulk writing. Exclude.
                ):
                    asyncio.create_task(
                        litellm.cache.async_add_cache_pipeline(result, *args, **kwargs)
                    )
                elif isinstance(litellm.cache.cache, S3Cache):
                    threading.Thread(
                        target=litellm.cache.add_cache,
                        args=(result,) + args,
                        kwargs=kwargs,
                    ).start()
                else:
                    asyncio.create_task(
                        litellm.cache.async_add_cache(result.json(), *args, **kwargs)
                    )
            else:
                asyncio.create_task(
                    litellm.cache.async_add_cache(result, *args, **kwargs)
                )
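Taken together, the new `litellm/caching/caching_handler.py` pulls the cache lookup/write logic out of `utils.py`: the async wrapper asks `LLMCachingHandler` for a cached result before dispatching the call, and hands the final response back to it afterwards. A rough sketch of that flow, using only the signatures shown above (the surrounding wrapper, its variable names, and the logging object are assumptions, not the actual `utils.py` code):

```python
import datetime

from litellm.caching.caching_handler import LLMCachingHandler

_caching_handler = LLMCachingHandler()


async def _wrapped_async_call(original_function, logging_obj, *args, **kwargs):
    # hypothetical wrapper; utils.py wires this up differently in detail
    start_time = datetime.datetime.now()
    caching_response = await _caching_handler._async_get_cache(
        model=kwargs.get("model", ""),
        original_function=original_function,
        logging_obj=logging_obj,
        start_time=start_time,
        call_type=original_function.__name__,
        kwargs=kwargs,
        args=args,
    )
    # full cache hit for completion / text_completion / transcription
    if caching_response.cached_result is not None:
        return caching_response.cached_result
    # every embedding input was cached, so no API call is needed
    if caching_response.embedding_all_elements_cache_hit:
        return caching_response.final_embedding_cached_response

    result = await original_function(*args, **kwargs)
    await _caching_handler._async_set_cache(
        result=result,
        original_function=original_function,
        kwargs=kwargs,
        args=args,
    )
    return result
```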
@@ -43,7 +43,7 @@
 # ### REDIS
 # # if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0:
 # # print(f"redis host: {os.getenv('REDIS_HOST')}; redis port: {os.getenv('REDIS_PORT')}; password: {os.getenv('REDIS_PASSWORD')}")
-# # from litellm.caching import Cache
+# # from litellm.caching.caching import Cache
 # # litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD"))
 # # print("\033[92mLiteLLM: Switched on Redis caching\033[0m")
@@ -21,7 +21,7 @@ import litellm.litellm_core_utils
 import litellm.litellm_core_utils.litellm_logging
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.exception_mapping_utils import (
     _add_key_name_and_team_to_alert,
@@ -13,7 +13,7 @@ import requests

 import litellm
 from litellm._logging import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.utils import StandardLoggingPayload
@@ -29,14 +29,30 @@ def create_client():
     clickhouse_host = os.getenv("CLICKHOUSE_HOST")
     if clickhouse_host is not None:
         verbose_logger.debug("setting up clickhouse")

+        port = os.getenv("CLICKHOUSE_PORT")
         if port is not None and isinstance(port, str):
             port = int(port)

+        host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
+        if host is None:
+            raise ValueError("CLICKHOUSE_HOST is not set")
+
+        username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
+        if username is None:
+            raise ValueError("CLICKHOUSE_USERNAME is not set")
+
+        password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
+        if password is None:
+            raise ValueError("CLICKHOUSE_PASSWORD is not set")
+        if port is None:
+            raise ValueError("CLICKHOUSE_PORT is not set")
+
         client = clickhouse_connect.get_client(
-            host=os.getenv("CLICKHOUSE_HOST"),
+            host=host,
             port=port,
-            username=os.getenv("CLICKHOUSE_USERNAME"),
+            username=username,
-            password=os.getenv("CLICKHOUSE_PASSWORD"),
+            password=password,
         )
         return client
     else:
@@ -176,11 +192,29 @@ def _start_clickhouse():
     if port is not None and isinstance(port, str):
         port = int(port)

+    port = os.getenv("CLICKHOUSE_PORT")
+    if port is not None and isinstance(port, str):
+        port = int(port)
+
+    host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
+    if host is None:
+        raise ValueError("CLICKHOUSE_HOST is not set")
+
+    username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
+    if username is None:
+        raise ValueError("CLICKHOUSE_USERNAME is not set")
+
+    password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
+    if password is None:
+        raise ValueError("CLICKHOUSE_PASSWORD is not set")
+    if port is None:
+        raise ValueError("CLICKHOUSE_PORT is not set")
+
     client = clickhouse_connect.get_client(
-        host=os.getenv("CLICKHOUSE_HOST"),
+        host=host,
         port=port,
-        username=os.getenv("CLICKHOUSE_USERNAME"),
+        username=username,
-        password=os.getenv("CLICKHOUSE_PASSWORD"),
+        password=password,
     )
     # view all tables in DB
     response = client.query("SHOW TABLES")
@@ -241,11 +275,25 @@ class ClickhouseLogger:
         if port is not None and isinstance(port, str):
             port = int(port)

+        host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
+        if host is None:
+            raise ValueError("CLICKHOUSE_HOST is not set")
+
+        username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
+        if username is None:
+            raise ValueError("CLICKHOUSE_USERNAME is not set")
+
+        password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
+        if password is None:
+            raise ValueError("CLICKHOUSE_PASSWORD is not set")
+        if port is None:
+            raise ValueError("CLICKHOUSE_PORT is not set")
+
         client = clickhouse_connect.get_client(
-            host=os.getenv("CLICKHOUSE_HOST"),
+            host=host,
             port=port,
-            username=os.getenv("CLICKHOUSE_USERNAME"),
+            username=username,
-            password=os.getenv("CLICKHOUSE_PASSWORD"),
+            password=password,
         )
         self.client = client
@@ -8,7 +8,7 @@ from typing import Any, Literal, Optional, Tuple, Union
 import dotenv
 from pydantic import BaseModel

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.llms.openai import ChatCompletionRequest
 from litellm.types.services import ServiceLoggerPayload
@@ -23,7 +23,7 @@ from litellm import (
     turn_off_message_logging,
     verbose_logger,
 )
-from litellm.caching import DualCache, InMemoryCache, S3Cache
+from litellm.caching.caching import DualCache, InMemoryCache, S3Cache
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
@@ -10,7 +10,7 @@ from openai import AsyncAzureOpenAI, AzureOpenAI
 from typing_extensions import overload

 import litellm
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.types.utils import EmbeddingResponse
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Tuple
 import httpx

 from litellm._logging import verbose_logger
-from litellm.caching import DualCache, InMemoryCache
+from litellm.caching.caching import DualCache, InMemoryCache
 from litellm.secret_managers.main import get_secret

 from .base import BaseLLM
@@ -29,7 +29,7 @@ import requests # type: ignore

 import litellm
 from litellm import verbose_logger
-from litellm.caching import InMemoryCache
+from litellm.caching.caching import InMemoryCache
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.litellm_logging import Logging
 from litellm.llms.custom_httpx.http_handler import (
@@ -8,7 +8,7 @@ from httpx import Response

 import litellm
 from litellm import verbose_logger
-from litellm.caching import InMemoryCache
+from litellm.caching.caching import InMemoryCache
 from litellm.llms.custom_httpx.http_handler import (
     _get_httpx_client,
     get_async_httpx_client,
@@ -4,7 +4,7 @@ from typing import Callable, List, Literal, Optional, Tuple, Union
 import httpx

 import litellm
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 from litellm.litellm_core_utils.litellm_logging import Logging
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.llms.OpenAI.openai import AllMessageValues
@@ -65,7 +65,7 @@ from litellm.utils import (
 )

 from ._logging import verbose_logger
-from .caching import disable_cache, enable_cache, update_cache
+from .caching.caching import disable_cache, enable_cache, update_cache
 from .llms import (
     aleph_alpha,
     baseten,
@ -16,7 +16,7 @@ from pydantic import BaseModel
|
||||||
|
|
||||||
import litellm
|
import litellm
|
||||||
from litellm._logging import verbose_proxy_logger
|
from litellm._logging import verbose_proxy_logger
|
||||||
from litellm.caching import DualCache
|
from litellm.caching.caching import DualCache
|
||||||
from litellm.proxy._types import (
|
from litellm.proxy._types import (
|
||||||
LiteLLM_EndUserTable,
|
LiteLLM_EndUserTable,
|
||||||
LiteLLM_JWTAuth,
|
LiteLLM_JWTAuth,
|
||||||
|
|
|
@ -15,7 +15,7 @@ from cryptography.hazmat.backends import default_backend
|
||||||
from cryptography.hazmat.primitives import serialization
|
from cryptography.hazmat.primitives import serialization
|
||||||
|
|
||||||
from litellm._logging import verbose_proxy_logger
|
from litellm._logging import verbose_proxy_logger
|
||||||
from litellm.caching import DualCache
|
from litellm.caching.caching import DualCache
|
||||||
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||||
from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
|
from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
|
||||||
from litellm.proxy.utils import PrismaClient
|
from litellm.proxy.utils import PrismaClient
|
||||||
|
|
|
@ -5,7 +5,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
|
||||||
|
|
||||||
import litellm
|
import litellm
|
||||||
from litellm._logging import verbose_proxy_logger
|
from litellm._logging import verbose_proxy_logger
|
||||||
from litellm.caching import RedisCache
|
from litellm.caching.caching import RedisCache
|
||||||
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
||||||
|
|
||||||
router = APIRouter(
|
router = APIRouter(
|
||||||
|
|
|
@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Literal, Optional, Union
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata

@@ -25,7 +25,7 @@ from fastapi import HTTPException
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.litellm_core_utils.logging_utils import (
     convert_litellm_response_object_to_str,

@@ -25,7 +25,7 @@ from fastapi import HTTPException
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.litellm_core_utils.logging_utils import (
     convert_litellm_response_object_to_str,

@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Literal, Optional, Union
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata

@@ -21,7 +21,7 @@ from pydantic import BaseModel
 import litellm  # noqa: E401
 from litellm import get_secret
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.utils import (
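Across the proxy, guardrail, and hook files above the change is purely mechanical: the flat `litellm/caching.py` module has become a `litellm/caching/` package whose classes now live in `caching/caching.py`. A minimal sketch of the migration pattern (the exact symbols imported vary per file):

```python
# Before this commit (flat module):
# from litellm.caching import Cache, DualCache, RedisCache

# After this commit (package path):
from litellm.caching.caching import Cache, DualCache, RedisCache

# Call sites are otherwise untouched, e.g.
user_api_key_cache = DualCache()
```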
@@ -465,7 +465,7 @@ async def health_readiness():
     # check Cache
     cache_type = None
     if litellm.cache is not None:
-        from litellm.caching import RedisSemanticCache
+        from litellm.caching.caching import RedisSemanticCache

         cache_type = litellm.cache.type
@@ -7,7 +7,7 @@ from fastapi import HTTPException
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -11,7 +11,7 @@ from fastapi import HTTPException
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache, InMemoryCache, RedisCache
+from litellm.caching.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -7,7 +7,7 @@ from fastapi import HTTPException
 import litellm
 from litellm import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -14,7 +14,7 @@ from fastapi import HTTPException
 import litellm
 from litellm import ModelResponse, Router
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.router import ModelGroupInfo

@@ -4,7 +4,7 @@ from fastapi import HTTPException
 import litellm
 from litellm import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -9,7 +9,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import ModelResponse
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
 from litellm.proxy._types import CurrentItemRateLimit, UserAPIKeyAuth

@@ -19,7 +19,7 @@ from fastapi import HTTPException
 import litellm  # noqa: E401
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.utils import (

@@ -18,7 +18,7 @@ from typing_extensions import overload
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.llms.prompt_templates.factory import prompt_injection_detection_default_pt
 from litellm.proxy._types import LiteLLMPromptInjectionParams, UserAPIKeyAuth

@@ -112,7 +112,7 @@ from litellm import (
     RetrieveBatchRequest,
 )
 from litellm._logging import verbose_proxy_logger, verbose_router_logger
-from litellm.caching import DualCache, RedisCache
+from litellm.caching.caching import DualCache, RedisCache
 from litellm.exceptions import RejectedRequestError
 from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
 from litellm.litellm_core_utils.core_helpers import (
@@ -1554,7 +1554,7 @@ class ProxyConfig:
             for key, value in litellm_settings.items():
                 if key == "cache" and value is True:
                     print(f"{blue_color_code}\nSetting Cache on Proxy")  # noqa
-                    from litellm.caching import Cache
+                    from litellm.caching.caching import Cache

                     cache_params = {}
                     if "cache_params" in litellm_settings:
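The ProxyConfig hunk above only swaps the import inside the `cache: true` branch; the surrounding behaviour stays the same. A rough, hedged sketch of what that branch amounts to when the proxy config enables the cache (the `cache_params` keys shown are illustrative, not taken from this diff):

```python
import litellm
from litellm.caching.caching import Cache

# hypothetical parsed proxy config
litellm_settings = {
    "cache": True,
    "cache_params": {"type": "redis", "host": "localhost", "port": 6379},
}

if litellm_settings.get("cache") is True:
    cache_params = {}
    if "cache_params" in litellm_settings:
        cache_params = litellm_settings["cache_params"]
    litellm.cache = Cache(**cache_params)  # proxy-wide response cache
```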
@@ -49,7 +49,7 @@ from litellm import (
 )
 from litellm._logging import verbose_proxy_logger
 from litellm._service_logger import ServiceLogging, ServiceTypes
-from litellm.caching import DualCache, RedisCache
+from litellm.caching.caching import DualCache, RedisCache
 from litellm.exceptions import RejectedRequestError
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger

@@ -37,7 +37,7 @@ import litellm.litellm_core_utils.exception_mapping_utils
 from litellm import get_secret_str
 from litellm._logging import verbose_router_logger
 from litellm.assistants.main import AssistantDeleted
-from litellm.caching import DualCache, InMemoryCache, RedisCache
+from litellm.caching.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
 from litellm.llms.AzureOpenAI.azure import get_azure_ad_token_from_oidc

@@ -14,7 +14,7 @@ from typing import Optional
 import dotenv  # type: ignore
 import requests

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger

@@ -9,7 +9,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import ModelResponse, token_counter, verbose_logger
 from litellm._logging import verbose_router_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger

@@ -9,7 +9,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import ModelResponse, token_counter, verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger

@@ -12,7 +12,7 @@ from pydantic import BaseModel
 from litellm import token_counter
 from litellm._logging import verbose_router_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.utils import print_verbose

@@ -10,7 +10,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import token_counter
 from litellm._logging import verbose_logger, verbose_router_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.types.router import RouterErrors
 from litellm.utils import get_utc_datetime, print_verbose

@@ -7,7 +7,7 @@ import time
 from typing import List, Optional, Tuple, TypedDict

 from litellm import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache


 class CooldownCacheValue(TypedDict):
@@ -1,9 +1,11 @@
-import heapq
-from pydantic import BaseModel
-from typing import Optional
 import enum
-from litellm.caching import DualCache, RedisCache
+import heapq
+from typing import Optional
+
+from pydantic import BaseModel
+
 from litellm import print_verbose
+from litellm.caching.caching import DualCache, RedisCache


 class SchedulerCacheKeys(enum.Enum):
@@ -4,7 +4,7 @@ from typing import Optional
 import litellm
 from litellm._logging import verbose_logger
-from litellm.caching import InMemoryCache
+from litellm.caching.caching import InMemoryCache
 from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
 from litellm.llms.custom_httpx.http_handler import _get_httpx_client
 from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem

@@ -12,7 +12,7 @@ from dotenv import load_dotenv
 import litellm
 from litellm._logging import print_verbose, verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.proxy._types import KeyManagementSystem
397 litellm/utils.py
@@ -56,7 +56,10 @@ import litellm._service_logger  # for storing API inputs, outputs, and metadata
 import litellm.litellm_core_utils
 import litellm.litellm_core_utils.audio_utils.utils
 import litellm.litellm_core_utils.json_validation_rule
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
+from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+
+_llm_caching_handler = LLMCachingHandler()
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.exception_mapping_utils import (
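The hunk above is the core of the refactor in `litellm/utils.py`: a single module-level `LLMCachingHandler` replaces the inline cache logic that used to live inside the `@client` wrapper. A sketch of the wiring, using only names that appear in this diff:

```python
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler

# one shared handler; the async wrapper below delegates both the lookup
# (_async_get_cache) and the write-back (_async_set_cache) to it
_llm_caching_handler = LLMCachingHandler()
```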
@@ -146,7 +149,13 @@ from typing import (
 from openai import OpenAIError as OriginalError

 from ._logging import verbose_logger
-from .caching import Cache, QdrantSemanticCache, RedisCache, RedisSemanticCache, S3Cache
+from .caching.caching import (
+    Cache,
+    QdrantSemanticCache,
+    RedisCache,
+    RedisSemanticCache,
+    S3Cache,
+)
 from .exceptions import (
     APIConnectionError,
     APIError,
@@ -1121,299 +1130,26 @@ def client(original_function):
     print_verbose(
         f"ASYNC kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}; kwargs.get('cache'): {kwargs.get('cache', None)}"
     )
-    # if caching is false, don't run this
-    final_embedding_cached_response = None
-    if (
-        (kwargs.get("caching", None) is None and litellm.cache is not None)
-        or kwargs.get("caching", False) is True
-    ) and (
-        kwargs.get("cache", {}).get("no-cache", False) is not True
-    ):  # allow users to control returning cached responses from the completion function
-        # checking cache
-        print_verbose("INSIDE CHECKING CACHE")
-        if (
-            litellm.cache is not None
-            and litellm.cache.supported_call_types is not None
-            and str(original_function.__name__)
-            in litellm.cache.supported_call_types
-        ):
-            print_verbose("Checking Cache")
-            if call_type == CallTypes.aembedding.value and isinstance(
-                kwargs["input"], list
-            ):
-                tasks = []
-                for idx, i in enumerate(kwargs["input"]):
-                    preset_cache_key = litellm.cache.get_cache_key(
-                        *args, **{**kwargs, "input": i}
-                    )
-                    tasks.append(
-                        litellm.cache.async_get_cache(
-                            cache_key=preset_cache_key
-                        )
-                    )
-                cached_result = await asyncio.gather(*tasks)
-                ## check if cached result is None ##
-                if cached_result is not None and isinstance(
-                    cached_result, list
-                ):
-                    if len(cached_result) == 1 and cached_result[0] is None:
-                        cached_result = None
-            elif isinstance(
-                litellm.cache.cache, RedisSemanticCache
-            ) or isinstance(litellm.cache.cache, RedisCache):
-                preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
-                kwargs["preset_cache_key"] = (
-                    preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
-                )
-                cached_result = await litellm.cache.async_get_cache(
-                    *args, **kwargs
-                )
-            elif isinstance(litellm.cache.cache, QdrantSemanticCache):
-                preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
-                kwargs["preset_cache_key"] = (
-                    preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
-                )
-                cached_result = await litellm.cache.async_get_cache(
-                    *args, **kwargs
-                )
-            else:  # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
-                preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
-                kwargs["preset_cache_key"] = (
-                    preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
-                )
-                cached_result = litellm.cache.get_cache(*args, **kwargs)
-            if cached_result is not None and not isinstance(
-                cached_result, list
-            ):
-                print_verbose("Cache Hit!", log_level="INFO")
-                cache_hit = True
-                end_time = datetime.datetime.now()
-                (
-                    model,
-                    custom_llm_provider,
-                    dynamic_api_key,
-                    api_base,
-                ) = litellm.get_llm_provider(
-                    model=model,
-                    custom_llm_provider=kwargs.get("custom_llm_provider", None),
-                    api_base=kwargs.get("api_base", None),
-                    api_key=kwargs.get("api_key", None),
-                )
-                print_verbose(
-                    f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
-                )
-                logging_obj.update_environment_variables(
-                    model=model,
-                    user=kwargs.get("user", None),
-                    optional_params={},
-                    litellm_params={
-                        "logger_fn": kwargs.get("logger_fn", None),
-                        "acompletion": True,
-                        "metadata": kwargs.get("metadata", {}),
-                        "model_info": kwargs.get("model_info", {}),
-                        "proxy_server_request": kwargs.get(
-                            "proxy_server_request", None
-                        ),
-                        "preset_cache_key": kwargs.get(
-                            "preset_cache_key", None
-                        ),
-                        "stream_response": kwargs.get("stream_response", {}),
-                        "api_base": kwargs.get("api_base", ""),
-                    },
-                    input=kwargs.get("messages", ""),
-                    api_key=kwargs.get("api_key", None),
-                    original_response=str(cached_result),
-                    additional_args=None,
-                    stream=kwargs.get("stream", False),
-                )
-                call_type = original_function.__name__
-                if call_type == CallTypes.acompletion.value and isinstance(
-                    cached_result, dict
-                ):
-                    if kwargs.get("stream", False) is True:
-                        cached_result = convert_to_streaming_response_async(
-                            response_object=cached_result,
-                        )
-                        cached_result = CustomStreamWrapper(
-                            completion_stream=cached_result,
-                            model=model,
-                            custom_llm_provider="cached_response",
-                            logging_obj=logging_obj,
-                        )
-                    else:
-                        cached_result = convert_to_model_response_object(
-                            response_object=cached_result,
-                            model_response_object=ModelResponse(),
-                        )
-                if (
-                    call_type == CallTypes.atext_completion.value
-                    and isinstance(cached_result, dict)
-                ):
-                    if kwargs.get("stream", False) is True:
-                        cached_result = convert_to_streaming_response_async(
-                            response_object=cached_result,
-                        )
-                        cached_result = CustomStreamWrapper(
-                            completion_stream=cached_result,
-                            model=model,
-                            custom_llm_provider="cached_response",
-                            logging_obj=logging_obj,
-                        )
-                    else:
-                        cached_result = TextCompletionResponse(**cached_result)
-                elif call_type == CallTypes.aembedding.value and isinstance(
-                    cached_result, dict
-                ):
-                    cached_result = convert_to_model_response_object(
-                        response_object=cached_result,
-                        model_response_object=EmbeddingResponse(),
-                        response_type="embedding",
-                    )
-                elif call_type == CallTypes.arerank.value and isinstance(
-                    cached_result, dict
-                ):
-                    cached_result = convert_to_model_response_object(
-                        response_object=cached_result,
-                        model_response_object=None,
-                        response_type="rerank",
-                    )
-                elif call_type == CallTypes.atranscription.value and isinstance(
-                    cached_result, dict
-                ):
-                    hidden_params = {
-                        "model": "whisper-1",
-                        "custom_llm_provider": custom_llm_provider,
-                        "cache_hit": True,
-                    }
-                    cached_result = convert_to_model_response_object(
-                        response_object=cached_result,
-                        model_response_object=TranscriptionResponse(),
-                        response_type="audio_transcription",
-                        hidden_params=hidden_params,
-                    )
-                if kwargs.get("stream", False) is False:
-                    # LOG SUCCESS
-                    asyncio.create_task(
-                        logging_obj.async_success_handler(
-                            cached_result, start_time, end_time, cache_hit
-                        )
-                    )
-                    threading.Thread(
-                        target=logging_obj.success_handler,
-                        args=(cached_result, start_time, end_time, cache_hit),
-                    ).start()
-                cache_key = kwargs.get("preset_cache_key", None)
-                if (
-                    isinstance(cached_result, BaseModel)
-                    or isinstance(cached_result, CustomStreamWrapper)
-                ) and hasattr(cached_result, "_hidden_params"):
-                    cached_result._hidden_params["cache_key"] = cache_key  # type: ignore
-                return cached_result
-            elif (
-                call_type == CallTypes.aembedding.value
-                and cached_result is not None
-                and isinstance(cached_result, list)
-                and litellm.cache is not None
-                and not isinstance(
-                    litellm.cache.cache, S3Cache
-                )  # s3 doesn't support bulk writing. Exclude.
-            ):
-                remaining_list = []
-                non_null_list = []
-                for idx, cr in enumerate(cached_result):
-                    if cr is None:
-                        remaining_list.append(kwargs["input"][idx])
-                    else:
-                        non_null_list.append((idx, cr))
-                original_kwargs_input = kwargs["input"]
-                kwargs["input"] = remaining_list
-                if len(non_null_list) > 0:
-                    print_verbose(
-                        f"EMBEDDING CACHE HIT! - {len(non_null_list)}"
-                    )
-                    final_embedding_cached_response = EmbeddingResponse(
-                        model=kwargs.get("model"),
-                        data=[None] * len(original_kwargs_input),
-                    )
-                    final_embedding_cached_response._hidden_params[
-                        "cache_hit"
-                    ] = True
-
-                    for val in non_null_list:
-                        idx, cr = val  # (idx, cr) tuple
-                        if cr is not None:
-                            final_embedding_cached_response.data[idx] = (
-                                Embedding(
-                                    embedding=cr["embedding"],
-                                    index=idx,
-                                    object="embedding",
-                                )
-                            )
-                if len(remaining_list) == 0:
-                    # LOG SUCCESS
-                    cache_hit = True
-                    end_time = datetime.datetime.now()
-                    (
-                        model,
-                        custom_llm_provider,
-                        dynamic_api_key,
-                        api_base,
-                    ) = litellm.get_llm_provider(
-                        model=model,
-                        custom_llm_provider=kwargs.get(
-                            "custom_llm_provider", None
-                        ),
-                        api_base=kwargs.get("api_base", None),
-                        api_key=kwargs.get("api_key", None),
-                    )
-                    print_verbose(
-                        f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
-                    )
-                    logging_obj.update_environment_variables(
-                        model=model,
-                        user=kwargs.get("user", None),
-                        optional_params={},
-                        litellm_params={
-                            "logger_fn": kwargs.get("logger_fn", None),
-                            "acompletion": True,
-                            "metadata": kwargs.get("metadata", {}),
-                            "model_info": kwargs.get("model_info", {}),
-                            "proxy_server_request": kwargs.get(
-                                "proxy_server_request", None
-                            ),
-                            "preset_cache_key": kwargs.get(
-                                "preset_cache_key", None
-                            ),
-                            "stream_response": kwargs.get(
-                                "stream_response", {}
-                            ),
-                            "api_base": "",
-                        },
-                        input=kwargs.get("messages", ""),
-                        api_key=kwargs.get("api_key", None),
-                        original_response=str(final_embedding_cached_response),
-                        additional_args=None,
-                        stream=kwargs.get("stream", False),
-                    )
-                    asyncio.create_task(
-                        logging_obj.async_success_handler(
-                            final_embedding_cached_response,
-                            start_time,
-                            end_time,
-                            cache_hit,
-                        )
-                    )
-                    threading.Thread(
-                        target=logging_obj.success_handler,
-                        args=(
-                            final_embedding_cached_response,
-                            start_time,
-                            end_time,
-                            cache_hit,
-                        ),
-                    ).start()
-                    return final_embedding_cached_response
+    _caching_handler_response: CachingHandlerResponse = (
+        await _llm_caching_handler._async_get_cache(
+            model=model,
+            original_function=original_function,
+            logging_obj=logging_obj,
+            start_time=start_time,
+            call_type=call_type,
+            kwargs=kwargs,
+            args=args,
+        )
+    )
+    if (
+        _caching_handler_response.cached_result is not None
+        and _caching_handler_response.final_embedding_cached_response is None
+    ):
+        return _caching_handler_response.cached_result
+
+    elif _caching_handler_response.embedding_all_elements_cache_hit is True:
+        return _caching_handler_response.final_embedding_cached_response
+
     # MODEL CALL
     result = await original_function(*args, **kwargs)
     end_time = datetime.datetime.now()
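Roughly 270 lines of inline lookup, response reconstruction, and success logging collapse into one awaited call plus two early returns. A hedged sketch of the new control flow (this is not litellm's literal wrapper; the surrounding variables are assumed to exist as they do inside the `@client` decorator):

```python
async def _lookup_then_call(original_function, model, logging_obj, start_time, call_type, args, kwargs):
    # ask the handler for a cached answer first
    response: CachingHandlerResponse = await _llm_caching_handler._async_get_cache(
        model=model,
        original_function=original_function,
        logging_obj=logging_obj,
        start_time=start_time,
        call_type=call_type,
        kwargs=kwargs,
        args=args,
    )

    # full cache hit (completion / whole request): return the cached object
    if response.cached_result is not None and response.final_embedding_cached_response is None:
        return response.cached_result

    # batched embedding request where every element was cached
    if response.embedding_all_elements_cache_hit is True:
        return response.final_embedding_cached_response

    # partial or no hit: fall through to the real model call
    return await original_function(*args, **kwargs)
```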
@@ -1467,51 +1203,14 @@ def client(original_function):
         original_response=result, model=model, optional_params=kwargs
     )

-    # [OPTIONAL] ADD TO CACHE
-    if (
-        (litellm.cache is not None)
-        and litellm.cache.supported_call_types is not None
-        and (
-            str(original_function.__name__)
-            in litellm.cache.supported_call_types
-        )
-        and (kwargs.get("cache", {}).get("no-store", False) is not True)
-    ):
-        if (
-            isinstance(result, litellm.ModelResponse)
-            or isinstance(result, litellm.EmbeddingResponse)
-            or isinstance(result, TranscriptionResponse)
-            or isinstance(result, RerankResponse)
-        ):
-            if (
-                isinstance(result, EmbeddingResponse)
-                and isinstance(kwargs["input"], list)
-                and litellm.cache is not None
-                and not isinstance(
-                    litellm.cache.cache, S3Cache
-                )  # s3 doesn't support bulk writing. Exclude.
-            ):
-                asyncio.create_task(
-                    litellm.cache.async_add_cache_pipeline(
-                        result, *args, **kwargs
-                    )
-                )
-            elif isinstance(litellm.cache.cache, S3Cache):
-                threading.Thread(
-                    target=litellm.cache.add_cache,
-                    args=(result,) + args,
-                    kwargs=kwargs,
-                ).start()
-            else:
-                asyncio.create_task(
-                    litellm.cache.async_add_cache(
-                        result.json(), *args, **kwargs
-                    )
-                )
-        else:
-            asyncio.create_task(
-                litellm.cache.async_add_cache(result, *args, **kwargs)
-            )
+    ## Add response to cache
+    await _llm_caching_handler._async_set_cache(
+        result=result,
+        original_function=original_function,
+        kwargs=kwargs,
+        args=args,
+    )

     # LOG SUCCESS - handle streaming success logging in the _next_ object
     print_verbose(
         f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
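The write path gets the same treatment: the branching over `async_add_cache`, `async_add_cache_pipeline`, and the threaded S3 `add_cache` moves behind a single handler call. Presumably the handler now performs that dispatch internally; a hedged sketch of the caller's side only:

```python
async def _store_in_cache(result, original_function, args, kwargs):
    # caller no longer inspects litellm.cache.cache or the result type;
    # the handler owns the "pipeline vs. thread vs. task" decision (assumption)
    await _llm_caching_handler._async_set_cache(
        result=result,
        original_function=original_function,
        kwargs=kwargs,
        args=args,
    )
```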
@@ -1528,24 +1227,32 @@ def client(original_function):
     # REBUILD EMBEDDING CACHING
     if (
         isinstance(result, EmbeddingResponse)
-        and final_embedding_cached_response is not None
-        and final_embedding_cached_response.data is not None
+        and _caching_handler_response.final_embedding_cached_response
+        is not None
+        and _caching_handler_response.final_embedding_cached_response.data
+        is not None
     ):
         idx = 0
         final_data_list = []
-        for item in final_embedding_cached_response.data:
+        for (
+            item
+        ) in _caching_handler_response.final_embedding_cached_response.data:
             if item is None and result.data is not None:
                 final_data_list.append(result.data[idx])
                 idx += 1
             else:
                 final_data_list.append(item)

-        final_embedding_cached_response.data = final_data_list
-        final_embedding_cached_response._hidden_params["cache_hit"] = True
-        final_embedding_cached_response._response_ms = (
+        _caching_handler_response.final_embedding_cached_response.data = (
+            final_data_list
+        )
+        _caching_handler_response.final_embedding_cached_response._hidden_params[
+            "cache_hit"
+        ] = True
+        _caching_handler_response.final_embedding_cached_response._response_ms = (
             end_time - start_time
         ).total_seconds() * 1000
-        return final_embedding_cached_response
+        return _caching_handler_response.final_embedding_cached_response

     return result
 except Exception as e:
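The embedding-rebuild block above fills the `None` slots of a partially cached `EmbeddingResponse` with the freshly computed items, in order. The same merge, as a small self-contained sketch (the helper name is invented for illustration):

```python
from typing import Any, List, Optional

def merge_cached_and_fresh(cached: List[Optional[Any]], fresh: List[Any]) -> List[Any]:
    """Cached slots keep their value; None slots are filled from `fresh` in order."""
    merged, fresh_idx = [], 0
    for item in cached:
        if item is None:
            merged.append(fresh[fresh_idx])
            fresh_idx += 1
        else:
            merged.append(item)
    return merged

# e.g. merge_cached_and_fresh(["e1", None, "e3"], ["e2"]) == ["e1", "e2", "e3"]
```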
@@ -19,7 +19,7 @@ from litellm._logging import verbose_proxy_logger
 from litellm.proxy.utils import PrismaClient, ProxyLogging

 verbose_proxy_logger.setLevel(level=logging.DEBUG)
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.router import (
     Deployment,
     updateDeployment,
@@ -28,7 +28,7 @@ import pytest
 from openai import APIError

 import litellm
-from litellm.caching import DualCache, RedisCache
+from litellm.caching.caching import DualCache, RedisCache
 from litellm.integrations.SlackAlerting.slack_alerting import (
     DeploymentMetrics,
     SlackAlerting,

@@ -13,7 +13,7 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import pytest, litellm
 from litellm.proxy.auth.auth_checks import get_end_user_object
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import LiteLLM_EndUserTable, LiteLLM_BudgetTable
 from litellm.proxy.utils import PrismaClient

@@ -21,7 +21,7 @@ import pytest
 import litellm
 from litellm import Router, mock_completion
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.utils import ProxyLogging

@@ -21,7 +21,7 @@ from litellm.proxy.enterprise.enterprise_hooks.banned_keywords import (
 from litellm import Router, mock_completion
 from litellm.proxy.utils import ProxyLogging, hash_token
 from litellm.proxy._types import UserAPIKeyAuth
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache


 @pytest.mark.asyncio

@@ -27,7 +27,7 @@ import pytest
 import litellm
 from litellm import Router, mock_completion
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.enterprise.enterprise_hooks.blocked_user_list import (
     _ENTERPRISE_BlockedUserList,

@@ -56,7 +56,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)
 from starlette.datastructures import URL

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import (
     BlockUsers,
     DynamoDBArgs,

@@ -21,7 +21,7 @@ import pytest
 import litellm
 from litellm import aembedding, completion, embedding
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 from unittest.mock import AsyncMock, patch, MagicMock
 import datetime
@@ -52,7 +52,7 @@ async def test_dual_cache_async_batch_get_cache():
     - hit redis for the other -> expect to return None
     - expect result = [in_memory_result, None]
     """
-    from litellm.caching import DualCache, InMemoryCache, RedisCache
+    from litellm.caching.caching import DualCache, InMemoryCache, RedisCache

     in_memory_cache = InMemoryCache()
     redis_cache = RedisCache()  # get credentials from environment

@@ -74,7 +74,7 @@ def test_dual_cache_batch_get_cache():
     - hit redis for the other -> expect to return None
     - expect result = [in_memory_result, None]
     """
-    from litellm.caching import DualCache, InMemoryCache, RedisCache
+    from litellm.caching.caching import DualCache, InMemoryCache, RedisCache

     in_memory_cache = InMemoryCache()
     redis_cache = RedisCache()  # get credentials from environment
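The two `test_dual_cache_*_batch_get_cache` tests only change their import; their scenario (one key served from memory, the other falling through to Redis and missing) is unchanged. A hedged sketch of that scenario — Redis credentials come from the environment, and the exact constructor/method signatures should be treated as illustrative rather than authoritative:

```python
from litellm.caching.caching import DualCache, InMemoryCache, RedisCache

in_memory_cache = InMemoryCache()
redis_cache = RedisCache()  # reads host/port/password from environment variables
dual_cache = DualCache(in_memory_cache=in_memory_cache, redis_cache=redis_cache)

in_memory_cache.set_cache(key="test_value", value="hello world")
# first key is found in memory; the second misses both layers
result = dual_cache.batch_get_cache(keys=["test_value", "test_value_2"])
assert result == ["hello world", None]
```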
@@ -520,6 +520,7 @@ async def test_embedding_caching_azure_individual_items_reordered():
     assert embedding_val_1[0]["id"] == embedding_val_2[0]["id"]
     ```
     """
+    litellm.set_verbose = True
     litellm.cache = Cache()
     common_msg = f"{uuid.uuid4()}"
     common_msg_2 = f"hey how's it going {uuid.uuid4()}"

@@ -532,9 +533,11 @@ async def test_embedding_caching_azure_individual_items_reordered():
     embedding_val_1 = await aembedding(
         model="azure/azure-embedding-model", input=embedding_1, caching=True
     )
+    print("embedding val 1", embedding_val_1)
     embedding_val_2 = await aembedding(
         model="azure/azure-embedding-model", input=embedding_2, caching=True
     )
+    print("embedding val 2", embedding_val_2)
     print(f"embedding_val_2._hidden_params: {embedding_val_2._hidden_params}")
     assert embedding_val_2._hidden_params["cache_hit"] == True
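The reordered-items test now enables verbose logging and prints both embedding responses before asserting the cache hit. A condensed, hedged usage sketch of what it verifies (the inputs are illustrative; the model name and the `_hidden_params["cache_hit"]` check come from the test itself):

```python
import litellm
from litellm import aembedding
from litellm.caching.caching import Cache

async def demo_reordered_embedding_cache():
    litellm.cache = Cache()  # default in-memory cache
    first_batch = ["common message", "hey how's it going"]
    second_batch = ["hey how's it going", "common message"]  # same items, reordered

    await aembedding(model="azure/azure-embedding-model", input=first_batch, caching=True)
    val_2 = await aembedding(model="azure/azure-embedding-model", input=second_batch, caching=True)
    assert val_2._hidden_params["cache_hit"] is True  # every element came from cache
```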
@@ -866,7 +869,7 @@ async def test_redis_cache_cluster_init_unit_test():
     from redis.asyncio import RedisCluster as AsyncRedisCluster
     from redis.cluster import RedisCluster

-    from litellm.caching import RedisCache
+    from litellm.caching.caching import RedisCache

     litellm.set_verbose = True

@@ -900,7 +903,7 @@ async def test_redis_cache_cluster_init_with_env_vars_unit_test():
     from redis.asyncio import RedisCluster as AsyncRedisCluster
     from redis.cluster import RedisCluster

-    from litellm.caching import RedisCache
+    from litellm.caching.caching import RedisCache

     litellm.set_verbose = True

@@ -1554,7 +1557,7 @@ def test_custom_redis_cache_params():


 def test_get_cache_key():
-    from litellm.caching import Cache
+    from litellm.caching.caching import Cache

     try:
         print("Testing get_cache_key")
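`test_get_cache_key` keeps its behaviour; only the import moves. What it exercises, as a hedged sketch (the argument names are typical litellm call kwargs; the key format itself is an internal detail):

```python
from litellm.caching.caching import Cache

cache = Cache()
messages = [{"role": "user", "content": "hi"}]

key_1 = cache.get_cache_key(model="gpt-3.5-turbo", messages=messages)
key_2 = cache.get_cache_key(model="gpt-3.5-turbo", messages=messages)
assert key_1 == key_2  # identical calls must map to the same cache key
```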
@@ -1989,7 +1992,7 @@ async def test_cache_default_off_acompletion():

     verbose_logger.setLevel(logging.DEBUG)

-    from litellm.caching import CacheMode
+    from litellm.caching.caching import CacheMode

     random_number = random.randint(
         1, 100000

@@ -2072,7 +2075,7 @@ async def test_dual_cache_uses_redis():
     - Assert that value from redis is used
     """
     litellm.set_verbose = True
-    from litellm.caching import DualCache, RedisCache
+    from litellm.caching.caching import DualCache, RedisCache

     current_usage = uuid.uuid4()

@@ -2095,7 +2098,7 @@ async def test_proxy_logging_setup():
     """
     Assert always_read_redis is True when used by internal usage cache
     """
-    from litellm.caching import DualCache
+    from litellm.caching.caching import DualCache
     from litellm.proxy.utils import ProxyLogging

     pl_obj = ProxyLogging(user_api_key_cache=DualCache())
@@ -2165,7 +2168,7 @@ async def test_redis_proxy_batch_redis_get_cache():
     - make 2nd call -> expect hit
     """

-    from litellm.caching import Cache, DualCache
+    from litellm.caching.caching import Cache, DualCache
     from litellm.proxy._types import UserAPIKeyAuth
     from litellm.proxy.hooks.batch_redis_get import _PROXY_BatchRedisRequests

@@ -15,7 +15,7 @@ sys.path.insert(
 import pytest
 import litellm
 from litellm import embedding, completion, Router
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 messages = [{"role": "user", "content": f"who is ishaan {time.time()}"}]
@@ -151,7 +151,7 @@ async def test_datadog_log_redis_failures():
     Test that poorly configured Redis is logged as Warning on DataDog
     """
     try:
-        from litellm.caching import Cache
+        from litellm.caching.caching import Cache
         from litellm.integrations.datadog.datadog import DataDogLogger

         litellm.cache = Cache(

@@ -24,7 +24,7 @@ import pytest
 from fastapi import Request

 import litellm
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable, LiteLLMRoutes
 from litellm.proxy.auth.handle_jwt import JWTHandler
 from litellm.proxy.management_endpoints.team_endpoints import new_team

@@ -89,7 +89,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)
 from starlette.datastructures import URL

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import (
     DynamoDBArgs,
     GenerateKeyRequest,

@@ -1444,7 +1444,7 @@ def test_call_with_key_over_budget(prisma_client):

     # update spend using track_cost callback, make 2nd request, it should fail
     from litellm import Choices, Message, ModelResponse, Usage
-    from litellm.caching import Cache
+    from litellm.caching.caching import Cache
     from litellm.proxy.proxy_server import (
         _PROXY_track_cost_callback as track_cost_callback,
     )

@@ -1564,7 +1564,7 @@ def test_call_with_key_over_budget_no_cache(prisma_client):
     setattr(litellm.proxy.proxy_server, "proxy_batch_write_at", 1)

     from litellm import Choices, Message, ModelResponse, Usage
-    from litellm.caching import Cache
+    from litellm.caching.caching import Cache

     litellm.cache = Cache()
     import time

@@ -1685,7 +1685,7 @@ def test_call_with_key_over_model_budget(prisma_client):

     # update spend using track_cost callback, make 2nd request, it should fail
     from litellm import Choices, Message, ModelResponse, Usage
-    from litellm.caching import Cache
+    from litellm.caching.caching import Cache
     from litellm.proxy.proxy_server import (
         _PROXY_track_cost_callback as track_cost_callback,
     )
@@ -25,7 +25,7 @@ import pytest
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_hooks.lakera_ai import lakeraAI_Moderation
 from litellm.proxy.proxy_server import embeddings

@@ -20,7 +20,7 @@ import pytest
 import litellm
 from litellm import Router
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.router_strategy.least_busy import LeastBusyLoggingHandler

 ### UNIT TESTS FOR LEAST BUSY LOGGING ###

@@ -20,7 +20,7 @@ from litellm.proxy.enterprise.enterprise_hooks.llm_guard import _ENTERPRISE_LLMG
 from litellm import Router, mock_completion
 from litellm.proxy.utils import ProxyLogging, hash_token
 from litellm.proxy._types import UserAPIKeyAuth
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache

 ### UNIT TESTS FOR LLM GUARD ###

@@ -10,7 +10,7 @@
 # import asyncio
 # from litellm import Router, Timeout
 # import time
-# from litellm.caching import Cache
+# from litellm.caching.caching import Cache
 # import litellm

 # litellm.cache = Cache(
@@ -15,7 +15,7 @@ sys.path.insert(
 import pytest
 from litellm import Router
 from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache

 ### UNIT TESTS FOR cost ROUTING ###

@@ -22,7 +22,7 @@ import pytest
 import litellm
 from litellm import Router
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler

 ### UNIT TESTS FOR LATENCY ROUTING ###

@@ -19,7 +19,7 @@
 # from litellm import Router
 # from litellm.proxy.utils import ProxyLogging, hash_token
 # from litellm.proxy._types import UserAPIKeyAuth
-# from litellm.caching import DualCache, RedisCache
+# from litellm.caching.caching import DualCache, RedisCache
 # from litellm.proxy.hooks.tpm_rpm_limiter import _PROXY_MaxTPMRPMLimiter
 # from datetime import datetime

@@ -22,7 +22,7 @@ from litellm.proxy.enterprise.enterprise_hooks.openai_moderation import (
 from litellm import Router, mock_completion
 from litellm.proxy.utils import ProxyLogging, hash_token
 from litellm.proxy._types import UserAPIKeyAuth
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache

 ### UNIT TESTS FOR OpenAI Moderation ###

@@ -23,7 +23,7 @@ import pytest
 import litellm
 from litellm import Router
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.hooks.parallel_request_limiter import (
     _PROXY_MaxParallelRequestsHandler as MaxParallelRequestsHandler,
@@ -22,7 +22,7 @@ import pytest
 import litellm
 from litellm import Router, mock_completion
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.hooks.presidio_pii_masking import _OPTIONAL_PresidioPIIMasking
 from litellm.proxy.utils import ProxyLogging

@@ -67,7 +67,7 @@ async def test_completion_with_caching_bad_call():
     litellm.set_verbose = True

     try:
-        from litellm.caching import RedisCache
+        from litellm.caching.caching import RedisCache

         litellm.service_callback = ["prometheus_system"]
         sl = ServiceLogging(mock_testing=True)

@@ -20,7 +20,7 @@ from litellm.proxy.hooks.prompt_injection_detection import (
 from litellm import Router, mock_completion
 from litellm.proxy.utils import ProxyLogging
 from litellm.proxy._types import UserAPIKeyAuth, LiteLLMPromptInjectionParams
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache


 @pytest.mark.asyncio

@@ -31,7 +31,7 @@ from starlette.datastructures import URL
 import litellm
 from litellm import Router, mock_completion
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.enterprise.enterprise_hooks.secret_detection import (
@@ -745,7 +745,7 @@ async def test_team_update_redis():
     """
     Tests if team update, updates the redis cache if set
     """
-    from litellm.caching import DualCache, RedisCache
+    from litellm.caching.caching import DualCache, RedisCache
     from litellm.proxy._types import LiteLLM_TeamTableCachedObj
     from litellm.proxy.auth.auth_checks import _cache_team_object

@@ -775,7 +775,7 @@ async def test_get_team_redis(client_no_auth):
     """
     Tests if get_team_object gets value from redis cache, if set
     """
-    from litellm.caching import DualCache, RedisCache
+    from litellm.caching.caching import DualCache, RedisCache
     from litellm.proxy.auth.auth_checks import get_team_object

     proxy_logging_obj: ProxyLogging = getattr(

@@ -26,7 +26,7 @@ from starlette.datastructures import URL

 import litellm
 from litellm import Router, mock_completion
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.enterprise.enterprise_hooks.secret_detection import (

@@ -3128,7 +3128,7 @@ async def test_azure_astreaming_and_function_calling():
             "content": f"What is the weather like in Boston? {uuid.uuid4()}",
         }
     ]
-    from litellm.caching import Cache
+    from litellm.caching.caching import Cache

     litellm.cache = Cache(
         type="redis",

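The hunk above cuts off mid-call; a minimal sketch of a Redis-backed cache configured through the new import path, where the host, port, and password values are placeholder assumptions rather than values taken from the diff:

```python
import litellm
from litellm.caching.caching import Cache

# Redis-backed response cache; connection details are placeholders for illustration.
litellm.cache = Cache(
    type="redis",
    host="localhost",
    port="6379",
    password="my-redis-password",
)
```
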
@@ -23,7 +23,7 @@ import pytest

 import litellm
 from litellm import Router
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.router_strategy.lowest_tpm_rpm_v2 import (
     LowestTPMLoggingHandler_v2 as LowestTPMLoggingHandler,
 )

@@ -27,7 +27,7 @@ import pytest
 import litellm
 from litellm import Router, mock_completion
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.management_endpoints.internal_user_endpoints import (
     new_user,

@@ -53,7 +53,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

 from starlette.datastructures import URL

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import (
     BlockUsers,
     DynamoDBArgs,

@@ -157,7 +157,7 @@ async def test_transcription_on_router():
 @pytest.mark.asyncio()
 async def test_transcription_caching():
     import litellm
-    from litellm.caching import Cache
+    from litellm.caching.caching import Cache

     litellm.set_verbose = True
     litellm.cache = Cache()

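The test above turns on the default in-memory cache with `Cache()`; a short sketch of a request pair that would exercise such a cache, where the model name and prompt are assumptions for illustration:

```python
import litellm
from litellm.caching.caching import Cache

litellm.cache = Cache()  # in-memory cache, as in the test above

# Two identical requests share one cache key; the second is served from the cache.
kwargs = dict(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What time is it in Boston?"}],
    caching=True,
)
first = litellm.completion(**kwargs)
second = litellm.completion(**kwargs)
```
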
@@ -71,7 +71,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

 from starlette.datastructures import URL

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import (
     DynamoDBArgs,
     GenerateKeyRequest,

@@ -78,7 +78,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

 from starlette.datastructures import URL

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import *

 proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())

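Several of these proxy test modules build their `ProxyLogging` object around a `DualCache`; a rough sketch of how that cache is typically read and written, assuming the synchronous `set_cache`/`get_cache` helpers keep their key/value keyword arguments:

```python
from litellm.caching.caching import DualCache

user_api_key_cache = DualCache()

# Writes land in the in-memory layer (and in Redis, when one is configured).
user_api_key_cache.set_cache(key="user_api_key:sk-1234", value={"spend": 0.0})

# Reads check the in-memory layer first and fall back to Redis on a miss.
cached_value = user_api_key_cache.get_cache(key="user_api_key:sk-1234")
print(cached_value)
```
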
@@ -17,7 +17,7 @@ from litellm.proxy._types import LitellmUserRoles
 import os
 import jwt
 import time
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache

 proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())

@@ -85,7 +85,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

 from starlette.datastructures import URL

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import (
     DynamoDBArgs,
     GenerateKeyRequest,