(refactor) caching use LLMCachingHandler for async_get_cache and set_cache (#6208)

* use folder for caching
* fix importing caching
* fix clickhouse pyright
* fix linting
* fix correctly pass kwargs and args
* fix test case for embedding
* fix linting
* fix embedding caching logic
* fix refactor handle utils.py
* fix test_embedding_caching_azure_individual_items_reordered

This commit is contained in: parent 20e50d7002, commit 4d1b4beb3d

96 changed files with 690 additions and 489 deletions
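For code that imports the cache classes directly, the user-visible effect of this refactor is a module move: the single litellm/caching.py module becomes the litellm/caching/ package, with the existing classes now living in litellm/caching/caching.py. A minimal sketch of the updated import path, mirroring the documentation changes further down (the top-level litellm.cache behaviour itself is unchanged):

```python
import litellm

# previously: from litellm.caching import Cache
from litellm.caching.caching import Cache

# configure as before, e.g. in-memory, redis, s3 or disk
litellm.cache = Cache()
```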
@@ -10,7 +10,7 @@ sys.path.insert(
 import asyncio
 from litellm import Router, Timeout
 import time
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 import litellm
 import openai

@@ -10,7 +10,7 @@ sys.path.insert(
 import asyncio
 from litellm import Router, Timeout
 import time
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 import litellm

 litellm.cache = Cache(
@@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';

 # Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk

-[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
+[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm.caching.caching.py)

 :::info

@@ -31,7 +31,7 @@ For the hosted version you can setup your own Redis DB here: https://app.redisla
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)

@@ -68,7 +68,7 @@ AWS_SECRET_ACCESS_KEY = "WOl*****"
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 # pass s3-bucket name
 litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2")

@@ -101,7 +101,7 @@ For the hosted version you can setup your own Redis DB here: https://app.redisla
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 random_number = random.randint(
     1, 100000

@@ -155,7 +155,7 @@ To set up a Qdrant cluster locally follow: https://qdrant.tech/documentation/qui
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 random_number = random.randint(
     1, 100000

@@ -210,7 +210,7 @@ assert response1.id == response2.id
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 # Make completion calls

@@ -246,7 +246,7 @@ Then you can use the disk cache as follows.
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache(type="disk")

 # Make completion calls

@@ -422,7 +422,7 @@ def custom_get_cache_key(*args, **kwargs):

 Set your function as litellm.cache.get_cache_key
 ```python
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])

@@ -434,7 +434,7 @@ litellm.cache = cache # set litellm.cache to your cache
 ## How to write custom add/get cache functions
 ### 1. Init Cache
 ```python
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 cache = Cache()
 ```

@@ -6,7 +6,7 @@ Use api.litellm.ai for caching `completion()` and `embedding()` responses
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache(type="hosted") # init cache to use api.litellm.ai

 # Make completion calls

@@ -31,7 +31,7 @@ response2 = completion(
 import time
 import litellm
 from litellm import completion, embedding
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache(type="hosted")

 start_time = time.time()

@@ -53,7 +53,7 @@ LiteLLM can cache your streamed responses for you
 import litellm
 import time
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache

 litellm.cache = Cache(type="hosted")

@@ -13,7 +13,7 @@ Keys in the cache are `model`, the following example will lead to a cache hit
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 # Make completion calls

@@ -35,7 +35,7 @@ response2 = completion(
 Add custom key-value pairs to your cache.

 ```python
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 cache = Cache()

 cache.add_cache(cache_key="test-key", result="1234")

@@ -50,7 +50,7 @@ LiteLLM can cache your streamed responses for you
 ```python
 import litellm
 from litellm import completion
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 # Make completion calls

@@ -77,7 +77,7 @@ Keys in the cache are `model`, the following example will lead to a cache hit
 import time
 import litellm
 from litellm import embedding
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 litellm.cache = Cache()

 start_time = time.time()
@@ -49,13 +49,13 @@ litellm_settings:
   cache: true
   cache_params: # set cache params for redis
     type: redis
-    namespace: "litellm_caching"
+    namespace: "litellm.caching.caching"
 ```

 and keys will be stored like:

 ```
-litellm_caching:<hash>
+litellm.caching.caching:<hash>
 ```

 #### Redis Cluster

@@ -645,7 +645,7 @@ litellm_settings:
     host: "localhost" # The host address for the Redis cache. Required if type is "redis".
     port: 6379 # The port number for the Redis cache. Required if type is "redis".
     password: "your_password" # The password for the Redis cache. Required if type is "redis".
-    namespace: "litellm_caching" # namespace for redis cache
+    namespace: "litellm.caching.caching" # namespace for redis cache

     # Optional - Redis Cluster Settings
     redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}]
@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Literal, Optional, Union

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata

@@ -6,7 +6,7 @@ import dotenv, os
 import requests

 from litellm.proxy._types import UserAPIKeyAuth
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache

 from typing import Literal, Union, Optional

@@ -13,7 +13,7 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 from typing import Optional, Literal, Union, Any
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from fastapi import HTTPException

@@ -9,7 +9,7 @@

 from typing import Optional, Literal
 import litellm
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from litellm._logging import verbose_proxy_logger

@@ -10,7 +10,7 @@
 from typing import Optional, Literal
 import litellm
 from litellm.proxy.utils import PrismaClient
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth, LiteLLM_EndUserTable
 from litellm.integrations.custom_logger import CustomLogger
 from litellm._logging import verbose_proxy_logger

@@ -9,7 +9,7 @@

 from typing import Optional, Literal, Union
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException

@@ -15,7 +15,7 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 from typing import Optional, Literal, Union
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException

@@ -13,7 +13,7 @@ import traceback
 import sys
 import uuid
 import os
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException

@@ -12,7 +12,7 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 from typing import Optional, Literal, Union
 import litellm, traceback, sys, uuid
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
 from fastapi import HTTPException

@@ -12,7 +12,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 ) # Adds the parent directory to the system path
 from typing import Optional
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm._logging import verbose_proxy_logger
 import tempfile
@@ -7,7 +7,7 @@ import threading
 import os
 from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 from litellm._logging import (
     set_verbose,
     _turn_on_debug,

@@ -2,7 +2,6 @@ from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from typing import List, Optional

 import litellm
-from litellm import completion
 from litellm._logging import print_verbose
 from litellm.utils import get_optional_params

@@ -108,7 +107,7 @@ def batch_completion(
             if "kwargs" in kwargs_modified:
                 original_kwargs = kwargs_modified.pop("kwargs")
             future = executor.submit(
-                completion, **kwargs_modified, **original_kwargs
+                litellm.completion, **kwargs_modified, **original_kwargs
             )
             completions.append(future)

@@ -156,7 +155,7 @@ def batch_completion_models(*args, **kwargs):
         with ThreadPoolExecutor(max_workers=len(models)) as executor:
             for model in models:
                 futures[model] = executor.submit(
-                    completion, *args, model=model, **kwargs
+                    litellm.completion, *args, model=model, **kwargs
                 )

             for model, future in sorted(

@@ -178,7 +177,9 @@ def batch_completion_models(*args, **kwargs):
                 ): # don't override deployment values e.g. model name, api base, etc.
                     deployment[key] = kwargs[key]
                 kwargs = {**deployment, **nested_kwargs}
-                futures[deployment["model"]] = executor.submit(completion, **kwargs)
+                futures[deployment["model"]] = executor.submit(
+                    litellm.completion, **kwargs
+                )

             while futures:
                 # wait for the first returned future

@@ -246,7 +247,7 @@ def batch_completion_models_all_responses(*args, **kwargs):

     with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
         for idx, model in enumerate(models):
-            future = executor.submit(completion, *args, model=model, **kwargs)
+            future = executor.submit(litellm.completion, *args, model=model, **kwargs)
             if future.result() is not None:
                 responses.append(future.result())

@@ -212,7 +212,7 @@ class RedisCache(BaseCache):

         from litellm._service_logger import ServiceLogging

-        from ._redis import get_redis_client, get_redis_connection_pool
+        from .._redis import get_redis_client, get_redis_connection_pool

         redis_kwargs = {}
         if host is not None:

@@ -276,7 +276,7 @@ class RedisCache(BaseCache):
             )

     def init_async_client(self):
-        from ._redis import get_redis_async_client
+        from .._redis import get_redis_async_client

         return get_redis_async_client(
             connection_pool=self.async_redis_conn_pool, **self.redis_kwargs

@@ -302,7 +302,7 @@ class RedisCache(BaseCache):
         except Exception as e:
             # NON blocking - notify users Redis is throwing an exception
             print_verbose(
-                f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
+                f"litellm.caching.caching: set() - Got exception from REDIS : {str(e)}"
             )

     def increment_cache(

@@ -705,7 +705,7 @@ class RedisCache(BaseCache):
         except Exception as e:
             # NON blocking - notify users Redis is throwing an exception
             verbose_logger.error(
-                "LiteLLM Caching: get() - Got exception from REDIS: ", e
+                "litellm.caching.caching: get() - Got exception from REDIS: ", e
             )

     def batch_get_cache(self, key_list) -> dict:

@@ -781,7 +781,7 @@ class RedisCache(BaseCache):
             )
             # NON blocking - notify users Redis is throwing an exception
             print_verbose(
-                f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
+                f"litellm.caching.caching: async get() - Got exception from REDIS: {str(e)}"
             )

     async def async_batch_get_cache(self, key_list) -> dict:
litellm/caching/caching_handler.py (new file, 440 lines)
@@ -0,0 +1,440 @@
"""
This contains LLMCachingHandler

This exposes two methods:
- async_get_cache
- async_set_cache

This file is a wrapper around caching.py

In each method it will call the appropriate method from caching.py
"""

import asyncio
import datetime
import threading
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

from pydantic import BaseModel

import litellm
from litellm._logging import print_verbose
from litellm.caching.caching import (
    Cache,
    QdrantSemanticCache,
    RedisCache,
    RedisSemanticCache,
    S3Cache,
)
from litellm.types.rerank import RerankResponse
from litellm.types.utils import (
    CallTypes,
    Embedding,
    EmbeddingResponse,
    ModelResponse,
    TextCompletionResponse,
    TranscriptionResponse,
)

if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
else:
    LiteLLMLoggingObj = Any

class CachingHandlerResponse(BaseModel):
    """
    This is the response object for the caching handler. We need to separate embedding cached responses and (completion / text_completion / transcription) cached responses

    For embeddings there can be a cache hit for some of the inputs in the list and a cache miss for others
    """

    cached_result: Optional[Any] = None
    final_embedding_cached_response: Optional[EmbeddingResponse] = None
    embedding_all_elements_cache_hit: bool = (
        False  # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
    )


class LLMCachingHandler:
    def __init__(self):
        pass

    async def _async_get_cache(
        self,
        model: str,
        original_function: Callable,
        logging_obj: LiteLLMLoggingObj,
        start_time: datetime.datetime,
        call_type: str,
        kwargs: Dict[str, Any],
        args: Optional[Tuple[Any, ...]] = None,
    ) -> CachingHandlerResponse:
        """
        Internal method to get from the cache.
        Handles different call types (embeddings, chat/completions, text_completion, transcription)
        and accordingly returns the cached response

        Args:
            model: str:
            original_function: Callable:
            logging_obj: LiteLLMLoggingObj:
            start_time: datetime.datetime:
            call_type: str:
            kwargs: Dict[str, Any]:
            args: Optional[Tuple[Any, ...]] = None:


        Returns:
            CachingHandlerResponse:
        Raises:
            None
        """
        from litellm.utils import (
            CustomStreamWrapper,
            convert_to_model_response_object,
            convert_to_streaming_response_async,
        )

        args = args or ()

        final_embedding_cached_response: Optional[EmbeddingResponse] = None
        cached_result: Optional[Any] = None
        if (
            (kwargs.get("caching", None) is None and litellm.cache is not None)
            or kwargs.get("caching", False) is True
        ) and (
            kwargs.get("cache", {}).get("no-cache", False) is not True
        ):  # allow users to control returning cached responses from the completion function
            # checking cache
            print_verbose("INSIDE CHECKING CACHE")
            if (
                litellm.cache is not None
                and litellm.cache.supported_call_types is not None
                and str(original_function.__name__)
                in litellm.cache.supported_call_types
            ):
                print_verbose("Checking Cache")
                if call_type == CallTypes.aembedding.value and isinstance(
                    kwargs["input"], list
                ):
                    tasks = []
                    for idx, i in enumerate(kwargs["input"]):
                        preset_cache_key = litellm.cache.get_cache_key(
                            *args, **{**kwargs, "input": i}
                        )
                        tasks.append(
                            litellm.cache.async_get_cache(cache_key=preset_cache_key)
                        )
                    cached_result = await asyncio.gather(*tasks)
                    ## check if cached result is None ##
                    if cached_result is not None and isinstance(cached_result, list):
                        # set cached_result to None if all elements are None
                        if all(result is None for result in cached_result):
                            cached_result = None
                elif isinstance(litellm.cache.cache, RedisSemanticCache) or isinstance(
                    litellm.cache.cache, RedisCache
                ):
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = await litellm.cache.async_get_cache(*args, **kwargs)
                elif isinstance(litellm.cache.cache, QdrantSemanticCache):
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = await litellm.cache.async_get_cache(*args, **kwargs)
                else:  # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
                    preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
                    kwargs["preset_cache_key"] = (
                        preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                    )
                    cached_result = litellm.cache.get_cache(*args, **kwargs)
            if cached_result is not None and not isinstance(cached_result, list):
                print_verbose("Cache Hit!")
                cache_hit = True
                end_time = datetime.datetime.now()
                (
                    model,
                    custom_llm_provider,
                    dynamic_api_key,
                    api_base,
                ) = litellm.get_llm_provider(
                    model=model,
                    custom_llm_provider=kwargs.get("custom_llm_provider", None),
                    api_base=kwargs.get("api_base", None),
                    api_key=kwargs.get("api_key", None),
                )
                print_verbose(
                    f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
                )
                logging_obj.update_environment_variables(
                    model=model,
                    user=kwargs.get("user", None),
                    optional_params={},
                    litellm_params={
                        "logger_fn": kwargs.get("logger_fn", None),
                        "acompletion": True,
                        "metadata": kwargs.get("metadata", {}),
                        "model_info": kwargs.get("model_info", {}),
                        "proxy_server_request": kwargs.get(
                            "proxy_server_request", None
                        ),
                        "preset_cache_key": kwargs.get("preset_cache_key", None),
                        "stream_response": kwargs.get("stream_response", {}),
                        "api_base": kwargs.get("api_base", ""),
                    },
                    input=kwargs.get("messages", ""),
                    api_key=kwargs.get("api_key", None),
                    original_response=str(cached_result),
                    additional_args=None,
                    stream=kwargs.get("stream", False),
                )
                call_type = original_function.__name__
                if call_type == CallTypes.acompletion.value and isinstance(
                    cached_result, dict
                ):
                    if kwargs.get("stream", False) is True:
                        cached_result = convert_to_streaming_response_async(
                            response_object=cached_result,
                        )
                        cached_result = CustomStreamWrapper(
                            completion_stream=cached_result,
                            model=model,
                            custom_llm_provider="cached_response",
                            logging_obj=logging_obj,
                        )
                    else:
                        cached_result = convert_to_model_response_object(
                            response_object=cached_result,
                            model_response_object=ModelResponse(),
                        )
                if call_type == CallTypes.atext_completion.value and isinstance(
                    cached_result, dict
                ):
                    if kwargs.get("stream", False) is True:
                        cached_result = convert_to_streaming_response_async(
                            response_object=cached_result,
                        )
                        cached_result = CustomStreamWrapper(
                            completion_stream=cached_result,
                            model=model,
                            custom_llm_provider="cached_response",
                            logging_obj=logging_obj,
                        )
                    else:
                        cached_result = TextCompletionResponse(**cached_result)
                elif call_type == CallTypes.aembedding.value and isinstance(
                    cached_result, dict
                ):
                    cached_result = convert_to_model_response_object(
                        response_object=cached_result,
                        model_response_object=EmbeddingResponse(),
                        response_type="embedding",
                    )
                elif call_type == CallTypes.arerank.value and isinstance(
                    cached_result, dict
                ):
                    cached_result = convert_to_model_response_object(
                        response_object=cached_result,
                        model_response_object=None,
                        response_type="rerank",
                    )
                elif call_type == CallTypes.atranscription.value and isinstance(
                    cached_result, dict
                ):
                    hidden_params = {
                        "model": "whisper-1",
                        "custom_llm_provider": custom_llm_provider,
                        "cache_hit": True,
                    }
                    cached_result = convert_to_model_response_object(
                        response_object=cached_result,
                        model_response_object=TranscriptionResponse(),
                        response_type="audio_transcription",
                        hidden_params=hidden_params,
                    )
                if kwargs.get("stream", False) is False:
                    # LOG SUCCESS
                    asyncio.create_task(
                        logging_obj.async_success_handler(
                            cached_result, start_time, end_time, cache_hit
                        )
                    )
                    threading.Thread(
                        target=logging_obj.success_handler,
                        args=(cached_result, start_time, end_time, cache_hit),
                    ).start()
                cache_key = kwargs.get("preset_cache_key", None)
                if (
                    isinstance(cached_result, BaseModel)
                    or isinstance(cached_result, CustomStreamWrapper)
                ) and hasattr(cached_result, "_hidden_params"):
                    cached_result._hidden_params["cache_key"] = cache_key  # type: ignore
                return CachingHandlerResponse(cached_result=cached_result)
            elif (
                call_type == CallTypes.aembedding.value
                and cached_result is not None
                and isinstance(cached_result, list)
                and litellm.cache is not None
                and not isinstance(
                    litellm.cache.cache, S3Cache
                )  # s3 doesn't support bulk writing. Exclude.
            ):
                remaining_list = []
                non_null_list = []
                for idx, cr in enumerate(cached_result):
                    if cr is None:
                        remaining_list.append(kwargs["input"][idx])
                    else:
                        non_null_list.append((idx, cr))
                original_kwargs_input = kwargs["input"]
                kwargs["input"] = remaining_list
                if len(non_null_list) > 0:
                    print_verbose(f"EMBEDDING CACHE HIT! - {len(non_null_list)}")
                    final_embedding_cached_response = EmbeddingResponse(
                        model=kwargs.get("model"),
                        data=[None] * len(original_kwargs_input),
                    )
                    final_embedding_cached_response._hidden_params["cache_hit"] = (
                        True
                    )

                    for val in non_null_list:
                        idx, cr = val  # (idx, cr) tuple
                        if cr is not None:
                            final_embedding_cached_response.data[idx] = Embedding(
                                embedding=cr["embedding"],
                                index=idx,
                                object="embedding",
                            )
                if len(remaining_list) == 0:
                    # LOG SUCCESS
                    cache_hit = True
                    end_time = datetime.datetime.now()
                    (
                        model,
                        custom_llm_provider,
                        dynamic_api_key,
                        api_base,
                    ) = litellm.get_llm_provider(
                        model=model,
                        custom_llm_provider=kwargs.get("custom_llm_provider", None),
                        api_base=kwargs.get("api_base", None),
                        api_key=kwargs.get("api_key", None),
                    )
                    print_verbose(
                        f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
                    )
                    logging_obj.update_environment_variables(
                        model=model,
                        user=kwargs.get("user", None),
                        optional_params={},
                        litellm_params={
                            "logger_fn": kwargs.get("logger_fn", None),
                            "acompletion": True,
                            "metadata": kwargs.get("metadata", {}),
                            "model_info": kwargs.get("model_info", {}),
                            "proxy_server_request": kwargs.get(
                                "proxy_server_request", None
                            ),
                            "preset_cache_key": kwargs.get(
                                "preset_cache_key", None
                            ),
                            "stream_response": kwargs.get("stream_response", {}),
                            "api_base": "",
                        },
                        input=kwargs.get("messages", ""),
                        api_key=kwargs.get("api_key", None),
                        original_response=str(final_embedding_cached_response),
                        additional_args=None,
                        stream=kwargs.get("stream", False),
                    )
                    asyncio.create_task(
                        logging_obj.async_success_handler(
                            final_embedding_cached_response,
                            start_time,
                            end_time,
                            cache_hit,
                        )
                    )
                    threading.Thread(
                        target=logging_obj.success_handler,
                        args=(
                            final_embedding_cached_response,
                            start_time,
                            end_time,
                            cache_hit,
                        ),
                    ).start()
                    return CachingHandlerResponse(
                        final_embedding_cached_response=final_embedding_cached_response,
                        embedding_all_elements_cache_hit=True,
                    )
        return CachingHandlerResponse(
            cached_result=cached_result,
            final_embedding_cached_response=final_embedding_cached_response,
        )

    async def _async_set_cache(
        self,
        result: Any,
        original_function: Callable,
        kwargs: Dict[str, Any],
        args: Optional[Tuple[Any, ...]] = None,
    ):
        """
        Internal method to check the type of the result & cache used and adds the result to the cache accordingly

        Args:
            result: Any:
            original_function: Callable:
            kwargs: Dict[str, Any]:
            args: Optional[Tuple[Any, ...]] = None:

        Returns:
            None
        Raises:
            None
        """
        args = args or ()
        # [OPTIONAL] ADD TO CACHE
        if (
            (litellm.cache is not None)
            and litellm.cache.supported_call_types is not None
            and (str(original_function.__name__) in litellm.cache.supported_call_types)
            and (kwargs.get("cache", {}).get("no-store", False) is not True)
        ):
            if (
                isinstance(result, litellm.ModelResponse)
                or isinstance(result, litellm.EmbeddingResponse)
                or isinstance(result, TranscriptionResponse)
                or isinstance(result, RerankResponse)
            ):
                if (
                    isinstance(result, EmbeddingResponse)
                    and isinstance(kwargs["input"], list)
                    and litellm.cache is not None
                    and not isinstance(
                        litellm.cache.cache, S3Cache
                    )  # s3 doesn't support bulk writing. Exclude.
                ):
                    asyncio.create_task(
                        litellm.cache.async_add_cache_pipeline(result, *args, **kwargs)
                    )
                elif isinstance(litellm.cache.cache, S3Cache):
                    threading.Thread(
                        target=litellm.cache.add_cache,
                        args=(result,) + args,
                        kwargs=kwargs,
                    ).start()
                else:
                    asyncio.create_task(
                        litellm.cache.async_add_cache(result.json(), *args, **kwargs)
                    )
            else:
                asyncio.create_task(
                    litellm.cache.async_add_cache(result, *args, **kwargs)
                )
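An illustrative sketch (not part of the commit) of how the async client wrapper is expected to drive the new handler, inferred from the method signatures above and the litellm/utils.py changes further down. The helper function name here is hypothetical; the real call site lives inside the wrapper in litellm/utils.py and supplies its own locals and logging object.

```python
# Hypothetical driver - shows the call pattern only, not the committed wrapper code.
import datetime
from typing import Any, Callable, Dict, Optional, Tuple

from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler

_llm_caching_handler = LLMCachingHandler()


async def check_cache_first(
    model: str,
    original_function: Callable,
    logging_obj: Any,
    call_type: str,
    kwargs: Dict[str, Any],
    args: Optional[Tuple[Any, ...]] = None,
) -> Optional[Any]:
    start_time = datetime.datetime.now()
    handler_response: CachingHandlerResponse = (
        await _llm_caching_handler._async_get_cache(
            model=model,
            original_function=original_function,
            logging_obj=logging_obj,
            start_time=start_time,
            call_type=call_type,
            kwargs=kwargs,
            args=args,
        )
    )
    # Non-embedding call types: a non-None cached_result is a full cache hit.
    if (
        handler_response.cached_result is not None
        and handler_response.final_embedding_cached_response is None
    ):
        return handler_response.cached_result
    # Embedding batches: every input was found in the cache, so no API call is needed.
    if handler_response.embedding_all_elements_cache_hit:
        return handler_response.final_embedding_cached_response
    # Cache miss (or partial embedding hit): the caller proceeds with the provider call
    # and later stores the result via _async_set_cache(result, original_function, kwargs, args).
    return None
```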
@@ -43,7 +43,7 @@
 # ### REDIS
 # # if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0:
 # # print(f"redis host: {os.getenv('REDIS_HOST')}; redis port: {os.getenv('REDIS_PORT')}; password: {os.getenv('REDIS_PASSWORD')}")
-# # from litellm.caching import Cache
+# # from litellm.caching.caching import Cache
 # # litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD"))
 # # print("\033[92mLiteLLM: Switched on Redis caching\033[0m")

@@ -21,7 +21,7 @@ import litellm.litellm_core_utils
 import litellm.litellm_core_utils.litellm_logging
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.exception_mapping_utils import (
     _add_key_name_and_team_to_alert,

@@ -13,7 +13,7 @@ import requests

 import litellm
 from litellm._logging import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.utils import StandardLoggingPayload

@@ -29,14 +29,30 @@ def create_client():
     clickhouse_host = os.getenv("CLICKHOUSE_HOST")
     if clickhouse_host is not None:
         verbose_logger.debug("setting up clickhouse")

         port = os.getenv("CLICKHOUSE_PORT")
         if port is not None and isinstance(port, str):
             port = int(port)

+        host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
+        if host is None:
+            raise ValueError("CLICKHOUSE_HOST is not set")
+
+        username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
+        if username is None:
+            raise ValueError("CLICKHOUSE_USERNAME is not set")
+
+        password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
+        if password is None:
+            raise ValueError("CLICKHOUSE_PASSWORD is not set")
+        if port is None:
+            raise ValueError("CLICKHOUSE_PORT is not set")
+
         client = clickhouse_connect.get_client(
-            host=os.getenv("CLICKHOUSE_HOST"),
+            host=host,
             port=port,
-            username=os.getenv("CLICKHOUSE_USERNAME"),
-            password=os.getenv("CLICKHOUSE_PASSWORD"),
+            username=username,
+            password=password,
         )
         return client
     else:

@@ -176,11 +192,29 @@ def _start_clickhouse():
     if port is not None and isinstance(port, str):
         port = int(port)

+    port = os.getenv("CLICKHOUSE_PORT")
+    if port is not None and isinstance(port, str):
+        port = int(port)
+
+    host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
+    if host is None:
+        raise ValueError("CLICKHOUSE_HOST is not set")
+
+    username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
+    if username is None:
+        raise ValueError("CLICKHOUSE_USERNAME is not set")
+
+    password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
+    if password is None:
+        raise ValueError("CLICKHOUSE_PASSWORD is not set")
+    if port is None:
+        raise ValueError("CLICKHOUSE_PORT is not set")
+
     client = clickhouse_connect.get_client(
-        host=os.getenv("CLICKHOUSE_HOST"),
+        host=host,
         port=port,
-        username=os.getenv("CLICKHOUSE_USERNAME"),
-        password=os.getenv("CLICKHOUSE_PASSWORD"),
+        username=username,
+        password=password,
     )
     # view all tables in DB
     response = client.query("SHOW TABLES")

@@ -241,11 +275,25 @@ class ClickhouseLogger:
         if port is not None and isinstance(port, str):
             port = int(port)

+        host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
+        if host is None:
+            raise ValueError("CLICKHOUSE_HOST is not set")
+
+        username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
+        if username is None:
+            raise ValueError("CLICKHOUSE_USERNAME is not set")
+
+        password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
+        if password is None:
+            raise ValueError("CLICKHOUSE_PASSWORD is not set")
+        if port is None:
+            raise ValueError("CLICKHOUSE_PORT is not set")
+
         client = clickhouse_connect.get_client(
-            host=os.getenv("CLICKHOUSE_HOST"),
+            host=host,
             port=port,
-            username=os.getenv("CLICKHOUSE_USERNAME"),
-            password=os.getenv("CLICKHOUSE_PASSWORD"),
+            username=username,
+            password=password,
         )
         self.client = client

@@ -8,7 +8,7 @@ from typing import Any, Literal, Optional, Tuple, Union
 import dotenv
 from pydantic import BaseModel

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.llms.openai import ChatCompletionRequest
 from litellm.types.services import ServiceLoggerPayload

@@ -23,7 +23,7 @@ from litellm import (
     turn_off_message_logging,
     verbose_logger,
 )
-from litellm.caching import DualCache, InMemoryCache, S3Cache
+from litellm.caching.caching import DualCache, InMemoryCache, S3Cache
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger

@@ -10,7 +10,7 @@ from openai import AsyncAzureOpenAI, AzureOpenAI
 from typing_extensions import overload

 import litellm
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.types.utils import EmbeddingResponse

@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Tuple
 import httpx

 from litellm._logging import verbose_logger
-from litellm.caching import DualCache, InMemoryCache
+from litellm.caching.caching import DualCache, InMemoryCache
 from litellm.secret_managers.main import get_secret

 from .base import BaseLLM

@@ -29,7 +29,7 @@ import requests  # type: ignore

 import litellm
 from litellm import verbose_logger
-from litellm.caching import InMemoryCache
+from litellm.caching.caching import InMemoryCache
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.litellm_logging import Logging
 from litellm.llms.custom_httpx.http_handler import (

@@ -8,7 +8,7 @@ from httpx import Response

 import litellm
 from litellm import verbose_logger
-from litellm.caching import InMemoryCache
+from litellm.caching.caching import InMemoryCache
 from litellm.llms.custom_httpx.http_handler import (
     _get_httpx_client,
     get_async_httpx_client,

@@ -4,7 +4,7 @@ from typing import Callable, List, Literal, Optional, Tuple, Union
 import httpx

 import litellm
-from litellm.caching import Cache
+from litellm.caching.caching import Cache
 from litellm.litellm_core_utils.litellm_logging import Logging
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.llms.OpenAI.openai import AllMessageValues

@@ -65,7 +65,7 @@ from litellm.utils import (
 )

 from ._logging import verbose_logger
-from .caching import disable_cache, enable_cache, update_cache
+from .caching.caching import disable_cache, enable_cache, update_cache
 from .llms import (
     aleph_alpha,
     baseten,
@@ -16,7 +16,7 @@ from pydantic import BaseModel

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.proxy._types import (
     LiteLLM_EndUserTable,
     LiteLLM_JWTAuth,

@@ -15,7 +15,7 @@ from cryptography.hazmat.backends import default_backend
 from cryptography.hazmat.primitives import serialization

 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
 from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
 from litellm.proxy.utils import PrismaClient

@@ -5,7 +5,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import RedisCache
+from litellm.caching.caching import RedisCache
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth

 router = APIRouter(

@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Literal, Optional, Union

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata

@@ -25,7 +25,7 @@ from fastapi import HTTPException

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.litellm_core_utils.logging_utils import (
     convert_litellm_response_object_to_str,

@@ -25,7 +25,7 @@ from fastapi import HTTPException

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.litellm_core_utils.logging_utils import (
     convert_litellm_response_object_to_str,

@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Literal, Optional, Union

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata

@@ -21,7 +21,7 @@ from pydantic import BaseModel
 import litellm  # noqa: E401
 from litellm import get_secret
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.utils import (

@@ -465,7 +465,7 @@ async def health_readiness():
     # check Cache
     cache_type = None
     if litellm.cache is not None:
-        from litellm.caching import RedisSemanticCache
+        from litellm.caching.caching import RedisSemanticCache

         cache_type = litellm.cache.type

@@ -7,7 +7,7 @@ from fastapi import HTTPException

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -11,7 +11,7 @@ from fastapi import HTTPException

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache, InMemoryCache, RedisCache
+from litellm.caching.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -7,7 +7,7 @@ from fastapi import HTTPException

 import litellm
 from litellm import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -14,7 +14,7 @@ from fastapi import HTTPException
 import litellm
 from litellm import ModelResponse, Router
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.router import ModelGroupInfo

@@ -4,7 +4,7 @@ from fastapi import HTTPException

 import litellm
 from litellm import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth

@@ -9,7 +9,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import ModelResponse
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
 from litellm.proxy._types import CurrentItemRateLimit, UserAPIKeyAuth

@@ -19,7 +19,7 @@ from fastapi import HTTPException

 import litellm  # noqa: E401
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.utils import (

@@ -18,7 +18,7 @@ from typing_extensions import overload

 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.llms.prompt_templates.factory import prompt_injection_detection_default_pt
 from litellm.proxy._types import LiteLLMPromptInjectionParams, UserAPIKeyAuth
@@ -112,7 +112,7 @@ from litellm import (
     RetrieveBatchRequest,
 )
 from litellm._logging import verbose_proxy_logger, verbose_router_logger
-from litellm.caching import DualCache, RedisCache
+from litellm.caching.caching import DualCache, RedisCache
 from litellm.exceptions import RejectedRequestError
 from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
 from litellm.litellm_core_utils.core_helpers import (

@@ -1554,7 +1554,7 @@ class ProxyConfig:
         for key, value in litellm_settings.items():
             if key == "cache" and value is True:
                 print(f"{blue_color_code}\nSetting Cache on Proxy")  # noqa
-                from litellm.caching import Cache
+                from litellm.caching.caching import Cache

                 cache_params = {}
                 if "cache_params" in litellm_settings:

@@ -49,7 +49,7 @@ from litellm import (
 )
 from litellm._logging import verbose_proxy_logger
 from litellm._service_logger import ServiceLogging, ServiceTypes
-from litellm.caching import DualCache, RedisCache
+from litellm.caching.caching import DualCache, RedisCache
 from litellm.exceptions import RejectedRequestError
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger

@@ -37,7 +37,7 @@ import litellm.litellm_core_utils.exception_mapping_utils
 from litellm import get_secret_str
 from litellm._logging import verbose_router_logger
 from litellm.assistants.main import AssistantDeleted
-from litellm.caching import DualCache, InMemoryCache, RedisCache
+from litellm.caching.caching import DualCache, InMemoryCache, RedisCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
 from litellm.llms.AzureOpenAI.azure import get_azure_ad_token_from_oidc
@@ -14,7 +14,7 @@ from typing import Optional
 import dotenv  # type: ignore
 import requests

-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger


@@ -9,7 +9,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import ModelResponse, token_counter, verbose_logger
 from litellm._logging import verbose_router_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger


@@ -9,7 +9,7 @@ from pydantic import BaseModel

 import litellm
 from litellm import ModelResponse, token_counter, verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger


@@ -12,7 +12,7 @@ from pydantic import BaseModel

 from litellm import token_counter
 from litellm._logging import verbose_router_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.utils import print_verbose

@@ -10,7 +10,7 @@ from pydantic import BaseModel
 import litellm
 from litellm import token_counter
 from litellm._logging import verbose_logger, verbose_router_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.types.router import RouterErrors
 from litellm.utils import get_utc_datetime, print_verbose

@@ -7,7 +7,7 @@ import time
 from typing import List, Optional, Tuple, TypedDict

 from litellm import verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache


 class CooldownCacheValue(TypedDict):
@@ -1,9 +1,11 @@
-import heapq
-from pydantic import BaseModel
-from typing import Optional
 import enum
-from litellm.caching import DualCache, RedisCache
+import heapq
+from typing import Optional
+
+from pydantic import BaseModel
+
 from litellm import print_verbose
+from litellm.caching.caching import DualCache, RedisCache


 class SchedulerCacheKeys(enum.Enum):
@@ -4,7 +4,7 @@ from typing import Optional

 import litellm
 from litellm._logging import verbose_logger
-from litellm.caching import InMemoryCache
+from litellm.caching.caching import InMemoryCache
 from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
 from litellm.llms.custom_httpx.http_handler import _get_httpx_client
 from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem

@@ -12,7 +12,7 @@ from dotenv import load_dotenv

 import litellm
 from litellm._logging import print_verbose, verbose_logger
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.proxy._types import KeyManagementSystem

litellm/utils.py (383 changed lines)
@@ -56,7 +56,10 @@ import litellm._service_logger  # for storing API inputs, outputs, and metadata
 import litellm.litellm_core_utils
 import litellm.litellm_core_utils.audio_utils.utils
 import litellm.litellm_core_utils.json_validation_rule
-from litellm.caching import DualCache
+from litellm.caching.caching import DualCache
+from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+
+_llm_caching_handler = LLMCachingHandler()
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.exception_mapping_utils import (

@@ -146,7 +149,13 @@ from typing import (
 from openai import OpenAIError as OriginalError

 from ._logging import verbose_logger
-from .caching import Cache, QdrantSemanticCache, RedisCache, RedisSemanticCache, S3Cache
+from .caching.caching import (
+    Cache,
+    QdrantSemanticCache,
+    RedisCache,
+    RedisSemanticCache,
+    S3Cache,
+)
 from .exceptions import (
     APIConnectionError,
     APIError,
@@ -1121,299 +1130,26 @@ def client(original_function):
print_verbose(
f"ASYNC kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}; kwargs.get('cache'): {kwargs.get('cache', None)}"
)
# if caching is false, don't run this
final_embedding_cached_response = None

if (
(kwargs.get("caching", None) is None and litellm.cache is not None)
or kwargs.get("caching", False) is True
) and (
kwargs.get("cache", {}).get("no-cache", False) is not True
): # allow users to control returning cached responses from the completion function
# checking cache
print_verbose("INSIDE CHECKING CACHE")
if (
litellm.cache is not None
and litellm.cache.supported_call_types is not None
and str(original_function.__name__)
in litellm.cache.supported_call_types
):
print_verbose("Checking Cache")
if call_type == CallTypes.aembedding.value and isinstance(
kwargs["input"], list
):
tasks = []
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = litellm.cache.get_cache_key(
*args, **{**kwargs, "input": i}
)
tasks.append(
litellm.cache.async_get_cache(
cache_key=preset_cache_key
)
)
cached_result = await asyncio.gather(*tasks)
## check if cached result is None ##
if cached_result is not None and isinstance(
cached_result, list
):
if len(cached_result) == 1 and cached_result[0] is None:
cached_result = None
elif isinstance(
litellm.cache.cache, RedisSemanticCache
) or isinstance(litellm.cache.cache, RedisCache):
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
)
cached_result = await litellm.cache.async_get_cache(
*args, **kwargs
)
elif isinstance(litellm.cache.cache, QdrantSemanticCache):
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
)
cached_result = await litellm.cache.async_get_cache(
*args, **kwargs
)
else: # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
)
cached_result = litellm.cache.get_cache(*args, **kwargs)
if cached_result is not None and not isinstance(
cached_result, list
):
print_verbose("Cache Hit!", log_level="INFO")
cache_hit = True
end_time = datetime.datetime.now()
(
model,
custom_llm_provider,
dynamic_api_key,
api_base,
) = litellm.get_llm_provider(
_caching_handler_response: CachingHandlerResponse = (
await _llm_caching_handler._async_get_cache(
model=model,
custom_llm_provider=kwargs.get("custom_llm_provider", None),
api_base=kwargs.get("api_base", None),
api_key=kwargs.get("api_key", None),
)
print_verbose(
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
)
logging_obj.update_environment_variables(
model=model,
user=kwargs.get("user", None),
optional_params={},
litellm_params={
"logger_fn": kwargs.get("logger_fn", None),
"acompletion": True,
"metadata": kwargs.get("metadata", {}),
"model_info": kwargs.get("model_info", {}),
"proxy_server_request": kwargs.get(
"proxy_server_request", None
),
"preset_cache_key": kwargs.get(
"preset_cache_key", None
),
"stream_response": kwargs.get("stream_response", {}),
"api_base": kwargs.get("api_base", ""),
},
input=kwargs.get("messages", ""),
api_key=kwargs.get("api_key", None),
original_response=str(cached_result),
additional_args=None,
stream=kwargs.get("stream", False),
)
call_type = original_function.__name__
if call_type == CallTypes.acompletion.value and isinstance(
cached_result, dict
):
if kwargs.get("stream", False) is True:
cached_result = convert_to_streaming_response_async(
response_object=cached_result,
)
cached_result = CustomStreamWrapper(
completion_stream=cached_result,
model=model,
custom_llm_provider="cached_response",
original_function=original_function,
logging_obj=logging_obj,
start_time=start_time,
call_type=call_type,
kwargs=kwargs,
args=args,
)
else:
cached_result = convert_to_model_response_object(
response_object=cached_result,
model_response_object=ModelResponse(),
)
if (
call_type == CallTypes.atext_completion.value
and isinstance(cached_result, dict)
_caching_handler_response.cached_result is not None
and _caching_handler_response.final_embedding_cached_response is None
):
if kwargs.get("stream", False) is True:
cached_result = convert_to_streaming_response_async(
response_object=cached_result,
)
cached_result = CustomStreamWrapper(
completion_stream=cached_result,
model=model,
custom_llm_provider="cached_response",
logging_obj=logging_obj,
)
else:
cached_result = TextCompletionResponse(**cached_result)
elif call_type == CallTypes.aembedding.value and isinstance(
cached_result, dict
):
cached_result = convert_to_model_response_object(
response_object=cached_result,
model_response_object=EmbeddingResponse(),
response_type="embedding",
)
elif call_type == CallTypes.arerank.value and isinstance(
cached_result, dict
):
cached_result = convert_to_model_response_object(
response_object=cached_result,
model_response_object=None,
response_type="rerank",
)
elif call_type == CallTypes.atranscription.value and isinstance(
cached_result, dict
):
hidden_params = {
"model": "whisper-1",
"custom_llm_provider": custom_llm_provider,
"cache_hit": True,
}
cached_result = convert_to_model_response_object(
response_object=cached_result,
model_response_object=TranscriptionResponse(),
response_type="audio_transcription",
hidden_params=hidden_params,
)
if kwargs.get("stream", False) is False:
# LOG SUCCESS
asyncio.create_task(
logging_obj.async_success_handler(
cached_result, start_time, end_time, cache_hit
|
||||
)
|
||||
)
|
||||
threading.Thread(
|
||||
target=logging_obj.success_handler,
|
||||
args=(cached_result, start_time, end_time, cache_hit),
|
||||
).start()
|
||||
cache_key = kwargs.get("preset_cache_key", None)
|
||||
if (
|
||||
isinstance(cached_result, BaseModel)
|
||||
or isinstance(cached_result, CustomStreamWrapper)
|
||||
) and hasattr(cached_result, "_hidden_params"):
|
||||
cached_result._hidden_params["cache_key"] = cache_key # type: ignore
|
||||
return cached_result
|
||||
elif (
|
||||
call_type == CallTypes.aembedding.value
|
||||
and cached_result is not None
|
||||
and isinstance(cached_result, list)
|
||||
and litellm.cache is not None
|
||||
and not isinstance(
|
||||
litellm.cache.cache, S3Cache
|
||||
) # s3 doesn't support bulk writing. Exclude.
|
||||
):
|
||||
remaining_list = []
|
||||
non_null_list = []
|
||||
for idx, cr in enumerate(cached_result):
|
||||
if cr is None:
|
||||
remaining_list.append(kwargs["input"][idx])
|
||||
else:
|
||||
non_null_list.append((idx, cr))
|
||||
original_kwargs_input = kwargs["input"]
|
||||
kwargs["input"] = remaining_list
|
||||
if len(non_null_list) > 0:
|
||||
print_verbose(
|
||||
f"EMBEDDING CACHE HIT! - {len(non_null_list)}"
|
||||
)
|
||||
final_embedding_cached_response = EmbeddingResponse(
|
||||
model=kwargs.get("model"),
|
||||
data=[None] * len(original_kwargs_input),
|
||||
)
|
||||
final_embedding_cached_response._hidden_params[
|
||||
"cache_hit"
|
||||
] = True
|
||||
return _caching_handler_response.cached_result
|
||||
|
||||
elif _caching_handler_response.embedding_all_elements_cache_hit is True:
|
||||
return _caching_handler_response.final_embedding_cached_response
|
||||
|
||||
for val in non_null_list:
|
||||
idx, cr = val # (idx, cr) tuple
|
||||
if cr is not None:
|
||||
final_embedding_cached_response.data[idx] = (
|
||||
Embedding(
|
||||
embedding=cr["embedding"],
|
||||
index=idx,
|
||||
object="embedding",
|
||||
)
|
||||
)
|
||||
if len(remaining_list) == 0:
|
||||
# LOG SUCCESS
|
||||
cache_hit = True
|
||||
end_time = datetime.datetime.now()
|
||||
(
|
||||
model,
|
||||
custom_llm_provider,
|
||||
dynamic_api_key,
|
||||
api_base,
|
||||
) = litellm.get_llm_provider(
|
||||
model=model,
|
||||
custom_llm_provider=kwargs.get(
|
||||
"custom_llm_provider", None
|
||||
),
|
||||
api_base=kwargs.get("api_base", None),
|
||||
api_key=kwargs.get("api_key", None),
|
||||
)
|
||||
print_verbose(
|
||||
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
|
||||
)
|
||||
logging_obj.update_environment_variables(
|
||||
model=model,
|
||||
user=kwargs.get("user", None),
|
||||
optional_params={},
|
||||
litellm_params={
|
||||
"logger_fn": kwargs.get("logger_fn", None),
|
||||
"acompletion": True,
|
||||
"metadata": kwargs.get("metadata", {}),
|
||||
"model_info": kwargs.get("model_info", {}),
|
||||
"proxy_server_request": kwargs.get(
|
||||
"proxy_server_request", None
|
||||
),
|
||||
"preset_cache_key": kwargs.get(
|
||||
"preset_cache_key", None
|
||||
),
|
||||
"stream_response": kwargs.get(
|
||||
"stream_response", {}
|
||||
),
|
||||
"api_base": "",
|
||||
},
|
||||
input=kwargs.get("messages", ""),
|
||||
api_key=kwargs.get("api_key", None),
|
||||
original_response=str(final_embedding_cached_response),
|
||||
additional_args=None,
|
||||
stream=kwargs.get("stream", False),
|
||||
)
|
||||
asyncio.create_task(
|
||||
logging_obj.async_success_handler(
|
||||
final_embedding_cached_response,
|
||||
start_time,
|
||||
end_time,
|
||||
cache_hit,
|
||||
)
|
||||
)
|
||||
threading.Thread(
|
||||
target=logging_obj.success_handler,
|
||||
args=(
|
||||
final_embedding_cached_response,
|
||||
start_time,
|
||||
end_time,
|
||||
cache_hit,
|
||||
),
|
||||
).start()
|
||||
return final_embedding_cached_response
|
||||
# MODEL CALL
|
||||
result = await original_function(*args, **kwargs)
|
||||
end_time = datetime.datetime.now()
|
||||
|
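Note: the hunk above removes the inline async cache-lookup logic from the wrapper and delegates it to the caching handler. A minimal sketch of the new read path, under these assumptions: `caching_handler` is the `LLMCachingHandler` instance the wrapper constructs elsewhere, and `_async_get_cache` accepts at least the keyword arguments visible in this hunk (the full signature is not shown here).

```python
# Sketch only - not the repo's exact wrapper body. Names mirror the hunk above;
# the helper function itself and its parameter list are illustrative.
async def get_cached_or_call(caching_handler, original_function, model, *args, **kwargs):
    handler_response = await caching_handler._async_get_cache(
        model=model,
        custom_llm_provider=kwargs.get("custom_llm_provider", None),
        api_base=kwargs.get("api_base", None),
        api_key=kwargs.get("api_key", None),
    )
    # Non-embedding cache hit: the handler returns the already-converted response.
    if (
        handler_response.cached_result is not None
        and handler_response.final_embedding_cached_response is None
    ):
        return handler_response.cached_result
    # Batch embedding request where every item was cached.
    if handler_response.embedding_all_elements_cache_hit is True:
        return handler_response.final_embedding_cached_response
    # Cache miss (or partial embedding hit): fall through to the real model call.
    return await original_function(*args, **kwargs)
```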
@ -1467,51 +1203,14 @@ def client(original_function):
original_response=result, model=model, optional_params=kwargs
)

# [OPTIONAL] ADD TO CACHE
if (
(litellm.cache is not None)
and litellm.cache.supported_call_types is not None
and (
str(original_function.__name__)
in litellm.cache.supported_call_types
)
and (kwargs.get("cache", {}).get("no-store", False) is not True)
):
if (
isinstance(result, litellm.ModelResponse)
or isinstance(result, litellm.EmbeddingResponse)
or isinstance(result, TranscriptionResponse)
or isinstance(result, RerankResponse)
):
if (
isinstance(result, EmbeddingResponse)
and isinstance(kwargs["input"], list)
and litellm.cache is not None
and not isinstance(
litellm.cache.cache, S3Cache
) # s3 doesn't support bulk writing. Exclude.
):
asyncio.create_task(
litellm.cache.async_add_cache_pipeline(
result, *args, **kwargs
)
)
elif isinstance(litellm.cache.cache, S3Cache):
threading.Thread(
target=litellm.cache.add_cache,
args=(result,) + args,
## Add response to cache
await _llm_caching_handler._async_set_cache(
result=result,
original_function=original_function,
kwargs=kwargs,
).start()
else:
asyncio.create_task(
litellm.cache.async_add_cache(
result.json(), *args, **kwargs
)
)
else:
asyncio.create_task(
litellm.cache.async_add_cache(result, *args, **kwargs)
args=args,
)

# LOG SUCCESS - handle streaming success logging in the _next_ object
print_verbose(
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
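Note: the write path is consolidated the same way. Instead of branching on the cache backend (pipeline write, threaded S3 write, plain `async_add_cache`), the wrapper now makes a single call on the handler. A sketch, assuming `args` is forwarded alongside the keyword arguments shown in the hunk:

```python
async def store_result_in_cache(caching_handler, result, original_function, args, kwargs):
    # Backend-specific behaviour (Redis pipeline writes, the threaded S3 write the old
    # branches handled explicitly) is assumed to live inside the handler now.
    await caching_handler._async_set_cache(
        result=result,
        original_function=original_function,
        kwargs=kwargs,
        args=args,
    )
```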
@ -1528,24 +1227,32 @@ def client(original_function):
# REBUILD EMBEDDING CACHING
if (
isinstance(result, EmbeddingResponse)
and final_embedding_cached_response is not None
and final_embedding_cached_response.data is not None
and _caching_handler_response.final_embedding_cached_response
is not None
and _caching_handler_response.final_embedding_cached_response.data
is not None
):
idx = 0
final_data_list = []
for item in final_embedding_cached_response.data:
for (
item
) in _caching_handler_response.final_embedding_cached_response.data:
if item is None and result.data is not None:
final_data_list.append(result.data[idx])
idx += 1
else:
final_data_list.append(item)

final_embedding_cached_response.data = final_data_list
final_embedding_cached_response._hidden_params["cache_hit"] = True
final_embedding_cached_response._response_ms = (
_caching_handler_response.final_embedding_cached_response.data = (
final_data_list
)
_caching_handler_response.final_embedding_cached_response._hidden_params[
"cache_hit"
] = True
_caching_handler_response.final_embedding_cached_response._response_ms = (
end_time - start_time
).total_seconds() * 1000
return final_embedding_cached_response
return _caching_handler_response.final_embedding_cached_response

return result
except Exception as e:
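Note: for partial embedding cache hits, the hunk above merges freshly computed embeddings into the cached response, slot by slot. A standalone sketch of that merge, assuming the cached data holds one entry per original input with `None` marking the misses:

```python
def merge_embedding_data(cached_data, fresh_data):
    """Fill the None slots of a partially cached embedding response in request order."""
    merged, fresh_idx = [], 0
    for item in cached_data:
        if item is None and fresh_data is not None:
            merged.append(fresh_data[fresh_idx])  # take the newly computed embedding
            fresh_idx += 1
        else:
            merged.append(item)  # keep the cached embedding
    return merged

# e.g. merge_embedding_data(["cached_a", None, "cached_c"], ["fresh_b"])
#      -> ["cached_a", "fresh_b", "cached_c"]
```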
@ -19,7 +19,7 @@ from litellm._logging import verbose_proxy_logger
from litellm.proxy.utils import PrismaClient, ProxyLogging

verbose_proxy_logger.setLevel(level=logging.DEBUG)
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.router import (
Deployment,
updateDeployment,
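Note: the remaining hunks are mechanical import updates. The caching module appears to have moved into a package, so every `from litellm.caching import ...` becomes `from litellm.caching.caching import ...`; the class names themselves are unchanged. For example (in-memory cache shown; the constructor arguments are untouched by this refactor):

```python
import litellm

# Before: from litellm.caching import Cache, DualCache
from litellm.caching.caching import Cache, DualCache

litellm.cache = Cache()  # in-memory cache, as used in the updated tests below
```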
@ -28,7 +28,7 @@ import pytest
from openai import APIError

import litellm
from litellm.caching import DualCache, RedisCache
from litellm.caching.caching import DualCache, RedisCache
from litellm.integrations.SlackAlerting.slack_alerting import (
DeploymentMetrics,
SlackAlerting,

@ -13,7 +13,7 @@ sys.path.insert(
) # Adds the parent directory to the system path
import pytest, litellm
from litellm.proxy.auth.auth_checks import get_end_user_object
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import LiteLLM_EndUserTable, LiteLLM_BudgetTable
from litellm.proxy.utils import PrismaClient

@ -21,7 +21,7 @@ import pytest

import litellm
from litellm import Router, mock_completion
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.utils import ProxyLogging

@ -21,7 +21,7 @@ from litellm.proxy.enterprise.enterprise_hooks.banned_keywords import (
from litellm import Router, mock_completion
from litellm.proxy.utils import ProxyLogging, hash_token
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache
from litellm.caching.caching import DualCache


@pytest.mark.asyncio

@ -27,7 +27,7 @@ import pytest
import litellm
from litellm import Router, mock_completion
from litellm._logging import verbose_proxy_logger
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.enterprise.enterprise_hooks.blocked_user_list import (
_ENTERPRISE_BlockedUserList,

@ -56,7 +56,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

from starlette.datastructures import URL

from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import (
BlockUsers,
DynamoDBArgs,

@ -21,7 +21,7 @@ import pytest

import litellm
from litellm import aembedding, completion, embedding
from litellm.caching import Cache
from litellm.caching.caching import Cache

from unittest.mock import AsyncMock, patch, MagicMock
import datetime
@ -52,7 +52,7 @@ async def test_dual_cache_async_batch_get_cache():
- hit redis for the other -> expect to return None
- expect result = [in_memory_result, None]
"""
from litellm.caching import DualCache, InMemoryCache, RedisCache
from litellm.caching.caching import DualCache, InMemoryCache, RedisCache

in_memory_cache = InMemoryCache()
redis_cache = RedisCache()  # get credentials from environment
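Note: a hypothetical usage sketch for the two-tier cache exercised by the test above, with the new import path. Assumptions: `DualCache` accepts the layers as `in_memory_cache`/`redis_cache` keyword arguments and exposes the `async_batch_get_cache(keys=...)` method the test name refers to; neither signature is shown in this diff.

```python
import asyncio

from litellm.caching.caching import DualCache, InMemoryCache, RedisCache


async def main():
    dual_cache = DualCache(
        in_memory_cache=InMemoryCache(),
        redis_cache=RedisCache(),  # Redis credentials read from the environment
    )
    # Keys found in memory resolve locally; the rest fall through to Redis,
    # and misses come back as None - e.g. [in_memory_result, None].
    print(await dual_cache.async_batch_get_cache(keys=["key-a", "key-b"]))


asyncio.run(main())
```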
@ -74,7 +74,7 @@ def test_dual_cache_batch_get_cache():
- hit redis for the other -> expect to return None
- expect result = [in_memory_result, None]
"""
from litellm.caching import DualCache, InMemoryCache, RedisCache
from litellm.caching.caching import DualCache, InMemoryCache, RedisCache

in_memory_cache = InMemoryCache()
redis_cache = RedisCache()  # get credentials from environment

@ -520,6 +520,7 @@ async def test_embedding_caching_azure_individual_items_reordered():
assert embedding_val_1[0]["id"] == embedding_val_2[0]["id"]
```
"""
litellm.set_verbose = True
litellm.cache = Cache()
common_msg = f"{uuid.uuid4()}"
common_msg_2 = f"hey how's it going {uuid.uuid4()}"

@ -532,9 +533,11 @@ async def test_embedding_caching_azure_individual_items_reordered():
embedding_val_1 = await aembedding(
model="azure/azure-embedding-model", input=embedding_1, caching=True
)
print("embedding val 1", embedding_val_1)
embedding_val_2 = await aembedding(
model="azure/azure-embedding-model", input=embedding_2, caching=True
)
print("embedding val 2", embedding_val_2)
print(f"embedding_val_2._hidden_params: {embedding_val_2._hidden_params}")
assert embedding_val_2._hidden_params["cache_hit"] == True
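Note: cached responses advertise themselves through hidden params, which is what the assertion above checks. A small sketch of the same check outside the test, assuming Azure embedding credentials are configured; the `cache_key` field is populated by the wrapper for cached responses (see the first hunk).

```python
import asyncio

import litellm
from litellm.caching.caching import Cache

litellm.cache = Cache()  # in-memory cache


async def main():
    # Two identical calls: the second one should be answered from the cache.
    for _ in range(2):
        response = await litellm.aembedding(
            model="azure/azure-embedding-model",
            input=["hello world"],
            caching=True,
        )
    print(response._hidden_params.get("cache_hit"), response._hidden_params.get("cache_key"))


asyncio.run(main())
```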
@ -866,7 +869,7 @@ async def test_redis_cache_cluster_init_unit_test():
from redis.asyncio import RedisCluster as AsyncRedisCluster
from redis.cluster import RedisCluster

from litellm.caching import RedisCache
from litellm.caching.caching import RedisCache

litellm.set_verbose = True

@ -900,7 +903,7 @@ async def test_redis_cache_cluster_init_with_env_vars_unit_test():
from redis.asyncio import RedisCluster as AsyncRedisCluster
from redis.cluster import RedisCluster

from litellm.caching import RedisCache
from litellm.caching.caching import RedisCache

litellm.set_verbose = True

@ -1554,7 +1557,7 @@ def test_custom_redis_cache_params():


def test_get_cache_key():
from litellm.caching import Cache
from litellm.caching.caching import Cache

try:
print("Testing get_cache_key")

@ -1989,7 +1992,7 @@ async def test_cache_default_off_acompletion():

verbose_logger.setLevel(logging.DEBUG)

from litellm.caching import CacheMode
from litellm.caching.caching import CacheMode

random_number = random.randint(
1, 100000

@ -2072,7 +2075,7 @@ async def test_dual_cache_uses_redis():
- Assert that value from redis is used
"""
litellm.set_verbose = True
from litellm.caching import DualCache, RedisCache
from litellm.caching.caching import DualCache, RedisCache

current_usage = uuid.uuid4()

@ -2095,7 +2098,7 @@ async def test_proxy_logging_setup():
"""
Assert always_read_redis is True when used by internal usage cache
"""
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy.utils import ProxyLogging

pl_obj = ProxyLogging(user_api_key_cache=DualCache())

@ -2165,7 +2168,7 @@ async def test_redis_proxy_batch_redis_get_cache():
- make 2nd call -> expect hit
"""

from litellm.caching import Cache, DualCache
from litellm.caching.caching import Cache, DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.hooks.batch_redis_get import _PROXY_BatchRedisRequests

@ -15,7 +15,7 @@ sys.path.insert(
import pytest
import litellm
from litellm import embedding, completion, Router
from litellm.caching import Cache
from litellm.caching.caching import Cache

messages = [{"role": "user", "content": f"who is ishaan {time.time()}"}]

@ -151,7 +151,7 @@ async def test_datadog_log_redis_failures():
Test that poorly configured Redis is logged as Warning on DataDog
"""
try:
from litellm.caching import Cache
from litellm.caching.caching import Cache
from litellm.integrations.datadog.datadog import DataDogLogger

litellm.cache = Cache(
@ -24,7 +24,7 @@ import pytest
from fastapi import Request

import litellm
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable, LiteLLMRoutes
from litellm.proxy.auth.handle_jwt import JWTHandler
from litellm.proxy.management_endpoints.team_endpoints import new_team

@ -89,7 +89,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

from starlette.datastructures import URL

from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import (
DynamoDBArgs,
GenerateKeyRequest,

@ -1444,7 +1444,7 @@ def test_call_with_key_over_budget(prisma_client):

# update spend using track_cost callback, make 2nd request, it should fail
from litellm import Choices, Message, ModelResponse, Usage
from litellm.caching import Cache
from litellm.caching.caching import Cache
from litellm.proxy.proxy_server import (
_PROXY_track_cost_callback as track_cost_callback,
)

@ -1564,7 +1564,7 @@ def test_call_with_key_over_budget_no_cache(prisma_client):
setattr(litellm.proxy.proxy_server, "proxy_batch_write_at", 1)

from litellm import Choices, Message, ModelResponse, Usage
from litellm.caching import Cache
from litellm.caching.caching import Cache

litellm.cache = Cache()
import time

@ -1685,7 +1685,7 @@ def test_call_with_key_over_model_budget(prisma_client):

# update spend using track_cost callback, make 2nd request, it should fail
from litellm import Choices, Message, ModelResponse, Usage
from litellm.caching import Cache
from litellm.caching.caching import Cache
from litellm.proxy.proxy_server import (
_PROXY_track_cost_callback as track_cost_callback,
)

@ -25,7 +25,7 @@ import pytest

import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.guardrails.guardrail_hooks.lakera_ai import lakeraAI_Moderation
from litellm.proxy.proxy_server import embeddings

@ -20,7 +20,7 @@ import pytest

import litellm
from litellm import Router
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler

### UNIT TESTS FOR LEAST BUSY LOGGING ###

@ -20,7 +20,7 @@ from litellm.proxy.enterprise.enterprise_hooks.llm_guard import _ENTERPRISE_LLMG
from litellm import Router, mock_completion
from litellm.proxy.utils import ProxyLogging, hash_token
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache
from litellm.caching.caching import DualCache

### UNIT TESTS FOR LLM GUARD ###


@ -10,7 +10,7 @@
# import asyncio
# from litellm import Router, Timeout
# import time
# from litellm.caching import Cache
# from litellm.caching.caching import Cache
# import litellm

# litellm.cache = Cache(

@ -15,7 +15,7 @@ sys.path.insert(
import pytest
from litellm import Router
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
from litellm.caching import DualCache
from litellm.caching.caching import DualCache

### UNIT TESTS FOR cost ROUTING ###
@ -22,7 +22,7 @@ import pytest

import litellm
from litellm import Router
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler

### UNIT TESTS FOR LATENCY ROUTING ###

@ -19,7 +19,7 @@
# from litellm import Router
# from litellm.proxy.utils import ProxyLogging, hash_token
# from litellm.proxy._types import UserAPIKeyAuth
# from litellm.caching import DualCache, RedisCache
# from litellm.caching.caching import DualCache, RedisCache
# from litellm.proxy.hooks.tpm_rpm_limiter import _PROXY_MaxTPMRPMLimiter
# from datetime import datetime

@ -22,7 +22,7 @@ from litellm.proxy.enterprise.enterprise_hooks.openai_moderation import (
from litellm import Router, mock_completion
from litellm.proxy.utils import ProxyLogging, hash_token
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache
from litellm.caching.caching import DualCache

### UNIT TESTS FOR OpenAI Moderation ###

@ -23,7 +23,7 @@ import pytest

import litellm
from litellm import Router
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.hooks.parallel_request_limiter import (
_PROXY_MaxParallelRequestsHandler as MaxParallelRequestsHandler,

@ -22,7 +22,7 @@ import pytest

import litellm
from litellm import Router, mock_completion
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.hooks.presidio_pii_masking import _OPTIONAL_PresidioPIIMasking
from litellm.proxy.utils import ProxyLogging

@ -67,7 +67,7 @@ async def test_completion_with_caching_bad_call():
litellm.set_verbose = True

try:
from litellm.caching import RedisCache
from litellm.caching.caching import RedisCache

litellm.service_callback = ["prometheus_system"]
sl = ServiceLogging(mock_testing=True)

@ -20,7 +20,7 @@ from litellm.proxy.hooks.prompt_injection_detection import (
from litellm import Router, mock_completion
from litellm.proxy.utils import ProxyLogging
from litellm.proxy._types import UserAPIKeyAuth, LiteLLMPromptInjectionParams
from litellm.caching import DualCache
from litellm.caching.caching import DualCache


@pytest.mark.asyncio

@ -31,7 +31,7 @@ from starlette.datastructures import URL

import litellm
from litellm import Router, mock_completion
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.enterprise.enterprise_hooks.secret_detection import (

@ -745,7 +745,7 @@ async def test_team_update_redis():
"""
Tests if team update, updates the redis cache if set
"""
from litellm.caching import DualCache, RedisCache
from litellm.caching.caching import DualCache, RedisCache
from litellm.proxy._types import LiteLLM_TeamTableCachedObj
from litellm.proxy.auth.auth_checks import _cache_team_object

@ -775,7 +775,7 @@ async def test_get_team_redis(client_no_auth):
"""
Tests if get_team_object gets value from redis cache, if set
"""
from litellm.caching import DualCache, RedisCache
from litellm.caching.caching import DualCache, RedisCache
from litellm.proxy.auth.auth_checks import get_team_object

proxy_logging_obj: ProxyLogging = getattr(

@ -26,7 +26,7 @@ from starlette.datastructures import URL

import litellm
from litellm import Router, mock_completion
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.enterprise.enterprise_hooks.secret_detection import (
@ -3128,7 +3128,7 @@ async def test_azure_astreaming_and_function_calling():
"content": f"What is the weather like in Boston? {uuid.uuid4()}",
}
]
from litellm.caching import Cache
from litellm.caching.caching import Cache

litellm.cache = Cache(
type="redis",

@ -23,7 +23,7 @@ import pytest

import litellm
from litellm import Router
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.router_strategy.lowest_tpm_rpm_v2 import (
LowestTPMLoggingHandler_v2 as LowestTPMLoggingHandler,
)

@ -27,7 +27,7 @@ import pytest
import litellm
from litellm import Router, mock_completion
from litellm._logging import verbose_proxy_logger
from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.management_endpoints.internal_user_endpoints import (
new_user,

@ -53,7 +53,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

from starlette.datastructures import URL

from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import (
BlockUsers,
DynamoDBArgs,

@ -157,7 +157,7 @@ async def test_transcription_on_router():
@pytest.mark.asyncio()
async def test_transcription_caching():
import litellm
from litellm.caching import Cache
from litellm.caching.caching import Cache

litellm.set_verbose = True
litellm.cache = Cache()

@ -71,7 +71,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

from starlette.datastructures import URL

from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import (
DynamoDBArgs,
GenerateKeyRequest,

@ -78,7 +78,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

from starlette.datastructures import URL

from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import *

proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())

@ -17,7 +17,7 @@ from litellm.proxy._types import LitellmUserRoles
import os
import jwt
import time
from litellm.caching import DualCache
from litellm.caching.caching import DualCache

proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())

@ -85,7 +85,7 @@ verbose_proxy_logger.setLevel(level=logging.DEBUG)

from starlette.datastructures import URL

from litellm.caching import DualCache
from litellm.caching.caching import DualCache
from litellm.proxy._types import (
DynamoDBArgs,
GenerateKeyRequest,