feat(vertex_ai_context_caching.py): support making context caching calls to vertex ai in a normal chat completion call (anthropic caching format)

Closes https://github.com/BerriAI/litellm/issues/5213
Krrish Dholakia 2024-08-26 18:47:45 -07:00
parent c503ff435e
commit b0cc1df2d6
16 changed files with 594 additions and 90 deletions
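For context, a minimal sketch of the call shape this commit enables. The model name, project settings, and message text are illustrative only; the cache_control block follows the anthropic caching format that the new helper below detects.

import litellm

response = litellm.completion(
    model="vertex_ai/claude-3-5-sonnet@20240620",  # illustrative Vertex AI Anthropic model
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "<large, reusable system prompt>",
                    # anthropic-style marker: this block should be cached
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {"role": "user", "content": "Summarize the cached document."},
    ],
)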


@@ -69,6 +69,7 @@ from litellm.litellm_core_utils.redact_messages import (
from litellm.litellm_core_utils.token_counter import get_modified_max_tokens
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.openai import (
    AllMessageValues,
    ChatCompletionNamedToolChoiceParam,
    ChatCompletionToolParam,
)
@@ -11549,3 +11550,25 @@ class ModelResponseListIterator:
class CustomModelResponseIterator(Iterable):
    def __init__(self) -> None:
        super().__init__()

def is_cached_message(message: AllMessageValues) -> bool:
    """
    Returns True if the message is marked as needing to be cached.

    Used for anthropic/gemini context caching.

    Follows the anthropic format {"cache_control": {"type": "ephemeral"}}
    """
    if message["content"] is None or isinstance(message["content"], str):
        return False

    for content in message["content"]:
        if (
            content["type"] == "text"
            and content.get("cache_control") is not None
            and content["cache_control"]["type"] == "ephemeral"  # type: ignore
        ):
            return True

    return False
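
A hedged usage sketch of the new helper; the import path assumes it lands in litellm.utils, as the surrounding diff context (ModelResponseListIterator) suggests, and the message dicts are illustrative.

from litellm.utils import is_cached_message  # assumed location based on this diff

cached_msg = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Long document to reuse across requests",
            "cache_control": {"type": "ephemeral"},
        }
    ],
}

# Content block carries the anthropic ephemeral marker -> treated as cached
assert is_cached_message(message=cached_msg) is True

# Plain string content has no cache_control marker -> never treated as cached
assert is_cached_message(message={"role": "user", "content": "hi"}) is False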