(refactor) make get_cache_key an under-100-LOC function (#6327)

* refactor - use helpers for namespace and hashing

* use openai types to get the relevant supported params

* use helpers for getting cache key

* fix test caching

* use get/set helpers for preset cache keys

* make get_cache_key under 100 LOC

* fix _get_model_param_value

* fix _get_caching_group

* fix linting error

* add unit testing for get cache key

* test_generate_streaming_content
Ishaan Jaff, 2024-10-19 15:21:11 +05:30, committed by GitHub
parent 4cbdad9fc5
commit 979e8ea526
5 changed files with 477 additions and 124 deletions
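For orientation before the diff: the refactor splits get_cache_key into small helpers for relevant-arg selection, per-param value resolution, SHA-256 hashing, and namespace prefixing. Below is a heavily simplified, illustrative sketch of that pipeline; sketch_get_cache_key is a hypothetical name, and the real method additionally handles preset cache keys, model/caching groups, and litellm-internal params.

    # Hypothetical sketch only -- not the actual implementation in the diff.
    import hashlib
    from typing import Optional

    def sketch_get_cache_key(relevant_params: set, namespace: Optional[str], **kwargs) -> str:
        # 1. keep only supported, non-None params (see _get_relevant_args_to_use_for_cache_key)
        raw_key = ""
        for param, value in kwargs.items():
            if param in relevant_params and value is not None:
                raw_key += f"{param}: {value}"
        # 2. hash the raw key (see _get_hashed_cache_key)
        hashed = hashlib.sha256(raw_key.encode()).hexdigest()
        # 3. optionally prefix a redis namespace (see _add_redis_namespace_to_cache_key)
        return f"{namespace}:{hashed}" if namespace else hashed

    print(sketch_get_cache_key({"model", "messages"}, "my-ns",
                               model="gpt-4",
                               messages=[{"role": "user", "content": "hi"}]))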


@@ -17,13 +17,26 @@ import logging
import time
import traceback
from enum import Enum
from typing import Any, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
from openai.types.audio.transcription_create_params import TranscriptionCreateParams
from openai.types.chat.completion_create_params import (
CompletionCreateParamsNonStreaming,
CompletionCreateParamsStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
)
from openai.types.embedding_create_params import EmbeddingCreateParams
from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
from litellm.types.caching import *
from litellm.types.rerank import RerankRequest
from litellm.types.utils import all_litellm_params
from .base_cache import BaseCache
@@ -220,7 +233,7 @@ class Cache:
if self.namespace is not None and isinstance(self.cache, RedisCache):
self.cache.namespace = self.namespace
def get_cache_key(self, *args, **kwargs) -> str: # noqa: PLR0915
def get_cache_key(self, *args, **kwargs) -> str:
"""
Get the cache key for the given arguments.
@@ -232,107 +245,20 @@ class Cache:
str: The cache key generated from the arguments, or None if no cache key could be generated.
"""
cache_key = ""
print_verbose(f"\nGetting Cache key. Kwargs: {kwargs}")
verbose_logger.debug("\nGetting Cache key. Kwargs: %s", kwargs)
# for streaming, we use preset_cache_key. It's created in wrapper(), we do this because optional params like max_tokens, get transformed for bedrock -> max_new_tokens
if kwargs.get("litellm_params", {}).get("preset_cache_key", None) is not None:
_preset_cache_key = kwargs.get("litellm_params", {}).get(
"preset_cache_key", None
)
print_verbose(f"\nReturning preset cache key: {_preset_cache_key}")
return _preset_cache_key
preset_cache_key = self._get_preset_cache_key_from_kwargs(**kwargs)
if preset_cache_key is not None:
verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
return preset_cache_key
# sort kwargs by keys, since model: [gpt-4, temperature: 0.2, max_tokens: 200] == [temperature: 0.2, max_tokens: 200, model: gpt-4]
completion_kwargs = [
"model",
"messages",
"prompt",
"temperature",
"top_p",
"n",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"stream",
]
embedding_only_kwargs = [
"input",
"encoding_format",
] # embedding kwargs = model, input, user, encoding_format. Model, user are checked in completion_kwargs
transcription_only_kwargs = [
"file",
"language",
]
rerank_only_kwargs = [
"top_n",
"rank_fields",
"return_documents",
"max_chunks_per_doc",
"documents",
"query",
]
# combined_kwargs - NEEDS to be ordered across get_cache_key(). Do not use a set()
combined_kwargs = (
completion_kwargs
+ embedding_only_kwargs
+ transcription_only_kwargs
+ rerank_only_kwargs
)
combined_kwargs = self._get_relevant_args_to_use_for_cache_key()
litellm_param_kwargs = all_litellm_params
for param in kwargs:
if param in combined_kwargs:
# check if param == model and model_group is passed in, then override model with model_group
if param == "model":
model_group = None
caching_group = None
metadata = kwargs.get("metadata", None)
litellm_params = kwargs.get("litellm_params", {})
if metadata is not None:
model_group = metadata.get("model_group")
model_group = metadata.get("model_group", None)
caching_groups = metadata.get("caching_groups", None)
if caching_groups:
for group in caching_groups:
if model_group in group:
caching_group = group
break
if litellm_params is not None:
metadata = litellm_params.get("metadata", None)
if metadata is not None:
model_group = metadata.get("model_group", None)
caching_groups = metadata.get("caching_groups", None)
if caching_groups:
for group in caching_groups:
if model_group in group:
caching_group = group
break
param_value = (
caching_group or model_group or kwargs[param]
) # use caching_group, if set then model_group if it exists, else use kwargs["model"]
elif param == "file":
file = kwargs.get("file")
metadata = kwargs.get("metadata", {})
litellm_params = kwargs.get("litellm_params", {})
# get checksum of file content
param_value = (
metadata.get("file_checksum")
or getattr(file, "name", None)
or metadata.get("file_name")
or litellm_params.get("file_name")
)
else:
if kwargs[param] is None:
continue # ignore None params
param_value = kwargs[param]
cache_key += f"{str(param)}: {str(param_value)}"
param_value: Optional[str] = self._get_param_value(param, kwargs)
if param_value is not None:
cache_key += f"{str(param)}: {str(param_value)}"
elif (
param not in litellm_param_kwargs
): # check if user passed in optional param - e.g. top_k
@@ -344,19 +270,200 @@ class Cache:
param_value = kwargs[param]
cache_key += f"{str(param)}: {str(param_value)}"
print_verbose(f"\nCreated cache key: {cache_key}")
# Use hashlib to create a sha256 hash of the cache key
verbose_logger.debug("\nCreated cache key: %s", cache_key)
hashed_cache_key = self._get_hashed_cache_key(cache_key)
hashed_cache_key = self._add_redis_namespace_to_cache_key(
hashed_cache_key, **kwargs
)
self._set_preset_cache_key_in_kwargs(
preset_cache_key=hashed_cache_key, **kwargs
)
return hashed_cache_key
def _get_param_value(
self,
param: str,
kwargs: dict,
) -> Optional[str]:
"""
Get the value for the given param from kwargs
"""
if param == "model":
return self._get_model_param_value(kwargs)
elif param == "file":
return self._get_file_param_value(kwargs)
return kwargs[param]
def _get_model_param_value(self, kwargs: dict) -> str:
"""
Handles getting the value for the 'model' param from kwargs
1. If caching groups are set, then return the caching group as the model https://docs.litellm.ai/docs/routing#caching-across-model-groups
2. Else if a model_group is set, then return the model_group as the model. This is used for all requests sent through the litellm.Router()
3. Else use the `model` passed in kwargs
"""
metadata: Dict = kwargs.get("metadata", {}) or {}
litellm_params: Dict = kwargs.get("litellm_params", {}) or {}
metadata_in_litellm_params: Dict = litellm_params.get("metadata", {}) or {}
model_group: Optional[str] = metadata.get(
"model_group"
) or metadata_in_litellm_params.get("model_group")
caching_group = self._get_caching_group(metadata, model_group)
return caching_group or model_group or kwargs["model"]
def _get_caching_group(
self, metadata: dict, model_group: Optional[str]
) -> Optional[str]:
caching_groups: Optional[List] = metadata.get("caching_groups", [])
if caching_groups:
for group in caching_groups:
if model_group in group:
return str(group)
return None
def _get_file_param_value(self, kwargs: dict) -> str:
"""
Handles getting the value for the 'file' param from kwargs. Used for `transcription` requests
"""
file = kwargs.get("file")
metadata = kwargs.get("metadata", {})
litellm_params = kwargs.get("litellm_params", {})
return (
metadata.get("file_checksum")
or getattr(file, "name", None)
or metadata.get("file_name")
or litellm_params.get("file_name")
)
def _get_preset_cache_key_from_kwargs(self, **kwargs) -> Optional[str]:
"""
Get the preset cache key from kwargs["litellm_params"]
We use _get_preset_cache_keys for two reasons
1. optional params like max_tokens, get transformed for bedrock -> max_new_tokens
2. avoid doing duplicate / repeated work
"""
if kwargs:
if "litellm_params" in kwargs:
return kwargs["litellm_params"].get("preset_cache_key", None)
return None
def _set_preset_cache_key_in_kwargs(self, preset_cache_key: str, **kwargs) -> None:
"""
Set the calculated cache key in kwargs
This is used to avoid doing duplicate / repeated work
Placed in kwargs["litellm_params"]
"""
if kwargs:
if "litellm_params" in kwargs:
kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key
def _get_relevant_args_to_use_for_cache_key(self) -> Set[str]:
"""
Gets the supported kwargs for each call type and combines them
"""
chat_completion_kwargs = self._get_litellm_supported_chat_completion_kwargs()
text_completion_kwargs = self._get_litellm_supported_text_completion_kwargs()
embedding_kwargs = self._get_litellm_supported_embedding_kwargs()
transcription_kwargs = self._get_litellm_supported_transcription_kwargs()
rerank_kwargs = self._get_litellm_supported_rerank_kwargs()
exclude_kwargs = self._get_kwargs_to_exclude_from_cache_key()
combined_kwargs = chat_completion_kwargs.union(
text_completion_kwargs,
embedding_kwargs,
transcription_kwargs,
rerank_kwargs,
)
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
def _get_litellm_supported_chat_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported chat completion kwargs
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
CompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
return all_chat_completion_kwargs
def _get_litellm_supported_text_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported text completion kwargs
This follows the OpenAI API Spec
"""
all_text_completion_kwargs = set(
TextCompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
return all_text_completion_kwargs
def _get_litellm_supported_rerank_kwargs(self) -> Set[str]:
"""
Get the litellm supported rerank kwargs
"""
return set(RerankRequest.model_fields.keys())
def _get_litellm_supported_embedding_kwargs(self) -> Set[str]:
"""
Get the litellm supported embedding kwargs
This follows the OpenAI API Spec
"""
return set(EmbeddingCreateParams.__annotations__.keys())
def _get_litellm_supported_transcription_kwargs(self) -> Set[str]:
"""
Get the litellm supported transcription kwargs
This follows the OpenAI API Spec
"""
return set(TranscriptionCreateParams.__annotations__.keys())
def _get_kwargs_to_exclude_from_cache_key(self) -> Set[str]:
"""
Get the kwargs to exclude from the cache key
"""
return set(["metadata"])
def _get_hashed_cache_key(self, cache_key: str) -> str:
"""
Get the hashed cache key for the given cache key.
Use hashlib to create a sha256 hash of the cache key
Args:
cache_key (str): The cache key to hash.
Returns:
str: The hashed cache key.
"""
hash_object = hashlib.sha256(cache_key.encode())
# Hexadecimal representation of the hash
hash_hex = hash_object.hexdigest()
print_verbose(f"Hashed cache key (SHA-256): {hash_hex}")
if kwargs.get("metadata", {}).get("redis_namespace", None) is not None:
_namespace = kwargs.get("metadata", {}).get("redis_namespace", None)
hash_hex = f"{_namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}")
elif self.namespace is not None:
hash_hex = f"{self.namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}")
verbose_logger.debug("Hashed cache key (SHA-256): %s", hash_hex)
return hash_hex
def _add_redis_namespace_to_cache_key(self, hash_hex: str, **kwargs) -> str:
"""
If a redis namespace is provided, add it to the cache key
Args:
hash_hex (str): The hashed cache key.
**kwargs: Additional keyword arguments.
Returns:
str: The final hashed cache key with the redis namespace.
"""
namespace = kwargs.get("metadata", {}).get("redis_namespace") or self.namespace
if namespace:
hash_hex = f"{namespace}:{hash_hex}"
verbose_logger.debug("Final hashed key: %s", hash_hex)
return hash_hex
def generate_streaming_content(self, content):
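Worth noting: the supported-kwarg sets above are derived from the OpenAI SDK's TypedDicts (and litellm's pydantic RerankRequest) rather than a hand-maintained list, so newly added OpenAI params are picked up automatically. A standalone sketch of the same idea, assuming the openai package is installed:

    # Sketch: derive "relevant kwargs" from the OpenAI SDK TypedDicts,
    # the same way _get_relevant_args_to_use_for_cache_key does above.
    from openai.types.chat.completion_create_params import (
        CompletionCreateParamsNonStreaming,
        CompletionCreateParamsStreaming,
    )
    from openai.types.embedding_create_params import EmbeddingCreateParams

    # TypedDict __annotations__ hold every declared request field, including inherited ones
    chat_params = set(CompletionCreateParamsNonStreaming.__annotations__) | set(
        CompletionCreateParamsStreaming.__annotations__
    )
    embedding_params = set(EmbeddingCreateParams.__annotations__)

    relevant = (chat_params | embedding_params) - {"metadata"}  # mirrors _get_kwargs_to_exclude_from_cache_key
    print("temperature" in relevant, "messages" in relevant, "input" in relevant)  # True True True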


@@ -182,7 +182,9 @@ class LLMCachingHandler:
end_time=end_time,
cache_hit=cache_hit,
)
cache_key = kwargs.get("preset_cache_key", None)
cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
**kwargs
)
if (
isinstance(cached_result, BaseModel)
or isinstance(cached_result, CustomStreamWrapper)
@@ -236,12 +238,7 @@ class LLMCachingHandler:
original_function=original_function
):
print_verbose("Checking Cache")
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
)
cached_result = litellm.cache.get_cache(*args, **kwargs)
if cached_result is not None:
if "detail" in cached_result:
# implies an error occurred
@@ -285,7 +282,9 @@ class LLMCachingHandler:
target=logging_obj.success_handler,
args=(cached_result, start_time, end_time, cache_hit),
).start()
cache_key = kwargs.get("preset_cache_key", None)
cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
**kwargs
)
if (
isinstance(cached_result, BaseModel)
or isinstance(cached_result, CustomStreamWrapper)
@@ -493,10 +492,6 @@ class LLMCachingHandler:
if all(result is None for result in cached_result):
cached_result = None
else:
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
)
if litellm.cache._supports_async() is True:
cached_result = await litellm.cache.async_get_cache(*args, **kwargs)
else: # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
@@ -842,10 +837,16 @@ class LLMCachingHandler:
"metadata": kwargs.get("metadata", {}),
"model_info": kwargs.get("model_info", {}),
"proxy_server_request": kwargs.get("proxy_server_request", None),
"preset_cache_key": kwargs.get("preset_cache_key", None),
"stream_response": kwargs.get("stream_response", {}),
}
if litellm.cache is not None:
litellm_params["preset_cache_key"] = (
litellm.cache._get_preset_cache_key_from_kwargs(**kwargs)
)
else:
litellm_params["preset_cache_key"] = None
logging_obj.update_environment_variables(
model=model,
user=kwargs.get("user", None),
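The caching_handler changes route all reads and writes of the precomputed key through the new get/set helpers, so the key is computed once and then reused (e.g. for streaming calls and cache-hit logging). A minimal sketch of that round trip, assuming a default local Cache and an empty litellm_params dict in kwargs:

    from litellm.caching.caching import Cache

    cache = Cache()  # defaults to the in-memory local cache
    kwargs = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "hi"}],
        "litellm_params": {},
    }
    first_key = cache.get_cache_key(**kwargs)  # computes the key and stores it in litellm_params
    assert cache._get_preset_cache_key_from_kwargs(**kwargs) == first_key
    assert cache.get_cache_key(**kwargs) == first_key  # second call short-circuits on the preset key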


@@ -664,10 +664,10 @@ class LangFuseLogger:
if "cache_key" in litellm.langfuse_default_tags:
_hidden_params = metadata.get("hidden_params", {}) or {}
_cache_key = _hidden_params.get("cache_key", None)
if _cache_key is None:
if _cache_key is None and litellm.cache is not None:
# fallback to using "preset_cache_key"
_preset_cache_key = kwargs.get("litellm_params", {}).get(
"preset_cache_key", None
_preset_cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
**kwargs
)
_cache_key = _preset_cache_key
tags.append(f"cache_key:{_cache_key}")


@@ -974,7 +974,7 @@ async def test_redis_cache_acompletion_stream():
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
time.sleep(0.5)
await asyncio.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
response2 = await litellm.acompletion(
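The test fix above swaps a blocking time.sleep for an awaitable asyncio.sleep inside the async test, so the event loop is not stalled between the streamed requests. Illustrative only:

    import asyncio
    import time

    async def blocking_wait():
        time.sleep(0.5)  # blocks the whole event loop; concurrent tasks stall

    async def cooperative_wait():
        await asyncio.sleep(0.5)  # yields to the event loop; concurrent tasks keep running

    asyncio.run(cooperative_wait())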


@@ -0,0 +1,245 @@
import os
import sys
import time
import traceback
import uuid
from dotenv import load_dotenv
from test_rerank import assert_response_shape
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
import hashlib
import random
import pytest
import litellm
from litellm import aembedding, completion, embedding
from litellm.caching.caching import Cache
from unittest.mock import AsyncMock, patch, MagicMock
from litellm.caching.caching_handler import LLMCachingHandler, CachingHandlerResponse
from litellm.caching.caching import LiteLLMCacheType
from litellm.types.utils import CallTypes
from litellm.types.rerank import RerankResponse
from litellm.types.utils import (
ModelResponse,
EmbeddingResponse,
TextCompletionResponse,
TranscriptionResponse,
Embedding,
)
from datetime import timedelta, datetime
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm._logging import verbose_logger
import logging
def test_get_kwargs_for_cache_key():
_cache = litellm.Cache()
relevant_kwargs = _cache._get_relevant_args_to_use_for_cache_key()
print(relevant_kwargs)
def test_get_cache_key_chat_completion():
cache = Cache()
kwargs = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hello, world!"}],
"temperature": 0.7,
}
cache_key_1 = cache.get_cache_key(**kwargs)
assert isinstance(cache_key_1, str)
assert len(cache_key_1) > 0
kwargs_2 = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hello, world!"}],
"max_completion_tokens": 100,
}
cache_key_2 = cache.get_cache_key(**kwargs_2)
assert cache_key_1 != cache_key_2
kwargs_3 = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hello, world!"}],
"max_completion_tokens": 100,
}
cache_key_3 = cache.get_cache_key(**kwargs_3)
assert cache_key_2 == cache_key_3
def test_get_cache_key_embedding():
cache = Cache()
kwargs = {
"model": "text-embedding-3-small",
"input": "Hello, world!",
"dimensions": 1536,
}
cache_key_1 = cache.get_cache_key(**kwargs)
assert isinstance(cache_key_1, str)
assert len(cache_key_1) > 0
kwargs_2 = {
"model": "text-embedding-3-small",
"input": "Hello, world!",
"dimensions": 1539,
}
cache_key_2 = cache.get_cache_key(**kwargs_2)
assert cache_key_1 != cache_key_2
kwargs_3 = {
"model": "text-embedding-3-small",
"input": "Hello, world!",
"dimensions": 1539,
}
cache_key_3 = cache.get_cache_key(**kwargs_3)
assert cache_key_2 == cache_key_3
def test_get_cache_key_text_completion():
cache = Cache()
kwargs = {
"model": "gpt-3.5-turbo",
"prompt": "Hello, world! here is a second line",
"best_of": 3,
"logit_bias": {"123": 1},
"seed": 42,
}
cache_key_1 = cache.get_cache_key(**kwargs)
assert isinstance(cache_key_1, str)
assert len(cache_key_1) > 0
kwargs_2 = {
"model": "gpt-3.5-turbo",
"prompt": "Hello, world! here is a second line",
"best_of": 30,
}
cache_key_2 = cache.get_cache_key(**kwargs_2)
assert cache_key_1 != cache_key_2
kwargs_3 = {
"model": "gpt-3.5-turbo",
"prompt": "Hello, world! here is a second line",
"best_of": 30,
}
cache_key_3 = cache.get_cache_key(**kwargs_3)
assert cache_key_2 == cache_key_3
def test_get_hashed_cache_key():
cache = Cache()
cache_key = "model:gpt-3.5-turbo,messages:Hello world"
hashed_key = cache._get_hashed_cache_key(cache_key)
assert len(hashed_key) == 64 # SHA-256 produces a 64-character hex string
def test_add_redis_namespace_to_cache_key():
cache = Cache(namespace="test_namespace")
hashed_key = "abcdef1234567890"
# Test with class-level namespace
result = cache._add_redis_namespace_to_cache_key(hashed_key)
assert result == "test_namespace:abcdef1234567890"
# Test with metadata namespace
kwargs = {"metadata": {"redis_namespace": "custom_namespace"}}
result = cache._add_redis_namespace_to_cache_key(hashed_key, **kwargs)
assert result == "custom_namespace:abcdef1234567890"
def test_get_model_param_value():
cache = Cache()
# Test with regular model
kwargs = {"model": "gpt-3.5-turbo"}
assert cache._get_model_param_value(kwargs) == "gpt-3.5-turbo"
# Test with model_group
kwargs = {"model": "gpt-3.5-turbo", "metadata": {"model_group": "gpt-group"}}
assert cache._get_model_param_value(kwargs) == "gpt-group"
# Test with caching_group
kwargs = {
"model": "gpt-3.5-turbo",
"metadata": {
"model_group": "openai-gpt-3.5-turbo",
"caching_groups": [("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],
},
}
assert (
cache._get_model_param_value(kwargs)
== "('openai-gpt-3.5-turbo', 'azure-gpt-3.5-turbo')"
)
kwargs = {
"model": "gpt-3.5-turbo",
"metadata": {
"model_group": "azure-gpt-3.5-turbo",
"caching_groups": [("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],
},
}
assert (
cache._get_model_param_value(kwargs)
== "('openai-gpt-3.5-turbo', 'azure-gpt-3.5-turbo')"
)
kwargs = {
"model": "gpt-3.5-turbo",
"metadata": {
"model_group": "not-in-caching-group-gpt-3.5-turbo",
"caching_groups": [("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],
},
}
assert cache._get_model_param_value(kwargs) == "not-in-caching-group-gpt-3.5-turbo"
def test_preset_cache_key():
"""
Test that the preset cache key is used if it is set in kwargs["litellm_params"]
"""
cache = Cache()
kwargs = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hello, world!"}],
"temperature": 0.7,
"litellm_params": {"preset_cache_key": "preset-cache-key"},
}
assert cache.get_cache_key(**kwargs) == "preset-cache-key"
def test_generate_streaming_content():
cache = Cache()
content = "Hello, this is a test message."
generator = cache.generate_streaming_content(content)
full_response = ""
chunk_count = 0
for chunk in generator:
chunk_count += 1
assert "choices" in chunk
assert len(chunk["choices"]) == 1
assert "delta" in chunk["choices"][0]
assert "role" in chunk["choices"][0]["delta"]
assert chunk["choices"][0]["delta"]["role"] == "assistant"
assert "content" in chunk["choices"][0]["delta"]
chunk_content = chunk["choices"][0]["delta"]["content"]
full_response += chunk_content
# Check that each chunk is no longer than 5 characters
assert len(chunk_content) <= 5
print("full_response from generate_streaming_content", full_response)
# Check that the full content is reconstructed correctly
assert full_response == content
# Check that there were multiple chunks
assert chunk_count > 1
print(f"Number of chunks: {chunk_count}")