import httpx
import openai

# Set LiteLLM proxy variables here
LITELLM_BASE_URL = "http://0.0.0.0:4000"
LITELLM_PROXY_API_KEY = "sk-1234"

client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
httpx_client = httpx.Client(timeout=30)

################################
# First create a cachedContents object
print("creating cached content")
create_cache = httpx_client.post(
    url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
    headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
    json={
        "model": "gemini-1.5-pro-001",
        "contents": [
            {
                "role": "user",
                "parts": [
                    {
                        # Repeat the sample text so the payload clears Vertex AI's
                        # minimum token threshold for explicit caching
                        "text": "This is sample text to demonstrate explicit caching."
                        * 4000
                    }
                ],
            }
        ],
    },
)
print("response from create_cache", create_cache)
create_cache_response = create_cache.json()
print("json from create_cache", create_cache_response)
# Fully qualified resource name, e.g. "projects/.../locations/.../cachedContents/..."
cached_content_name = create_cache_response["name"]

#################################
# Use the `cachedContents` object in your /chat/completions
response = client.chat.completions.create(  # type: ignore
    model="gemini-1.5-pro-001",
    max_tokens=8192,
    messages=[
        {
            "role": "user",
            "content": "what is the sample text about?",
        },
    ],
    temperature=0.7,  # pass a float, not a string
    extra_body={"cached_content": cached_content_name},  # 👈 key change
)
print("response from proxy", response)