docs cachedContent endpoint

2024-08-08 16:06:23 -07:00 · 2024-08-08 16:06:23 -07:00 · a3dd3a19fa
commit a3dd3a19fa
parent cae941f4c0
2 changed files with 81 additions and 39 deletions
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@ -440,12 +440,20 @@ Use Vertex AI Context Caching
 1. Add model to config.yaml
 ```yaml
 model_list:
+  # used for /chat/completions, /completions, /embeddings endpoints
  - model_name: gemini-1.5-pro-001
    litellm_params:
      model: vertex_ai_beta/gemini-1.5-pro-001
      vertex_project: "project-id"
      vertex_location: "us-central1"
      vertex_credentials: "/path/to/service_account.json" # [OPTIONAL] Do this OR `!gcloud auth application-default login` - run this to add vertex credentials to your env
+
+# used for the /cachedContent and vertexAI native endpoints
+default_vertex_config:
+  vertex_project: "adroit-crow-413218"
+  vertex_location: "us-central1"
+  vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
+
 ```

 2. Start Proxy 
@ -456,40 +464,58 @@ $ litellm --config /path/to/config.yaml

 3. Make Request!

+- First create a cachedContents object by calling the Vertex `cachedContents` endpoint. [VertexAI API Ref for cachedContents endpoint](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest). (LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API)
+- Use the `cachedContents` object in your /chat/completions request to vertexAI
+
 ```python
 import datetime
 import openai
-import vertexai
-from vertexai.generative_models import Content, Part
-from vertexai.preview import caching
-from vertexai.preview.generative_models import GenerativeModel
+import httpx

-# use Vertex AI SDK to create CachedContent
-vertexai.init(project="adroit-crow-413218", location="us-central1")
+# Set Litellm proxy variables here
+LITELLM_BASE_URL = "http://0.0.0.0:4000"
+LITELLM_PROXY_API_KEY = "sk-1234"
+
+client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
+httpx_client = httpx.Client(timeout=30)
+
+################################
+# First create a cachedContents object
+# this request gets forwarded as is to: https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest
 print("creating cached content")
-contents_here: list[Content] = [
-    Content(role="user", parts=[Part.from_text("huge string of text here" * 10000)])
-]
-cached_content = caching.CachedContent.create(
-    model_name="gemini-1.5-pro-001",
-    contents=contents_here,
-    expire_time=datetime.datetime(2024, 8, 10),
+create_cache = httpx_client.post(
+    url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
+    headers = {"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
+    json = {
+        "model": "gemini-1.5-pro-001",
+        "contents": [
+            {
+                "role": "user",
+                "parts": [{
+                    "text": "This is sample text to demonstrate explicit caching."*4000
+                }]
+            }
+        ],
+    }
 )
+print("response from create_cache", create_cache)
+create_cache_response = create_cache.json()
+print("json from create_cache", create_cache_response)
+cached_content_name = create_cache_response["name"]

-
-# use OpenAI SDK to send a request to LiteLLM Proxy
-# base_url is litellm proxy server and api_key is api key to litellm proxy
-client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
-response = client.chat.completions.create(
+#################################
+# Use the `cachedContents` object in your /chat/completions
+response = client.chat.completions.create(  # type: ignore
    model="gemini-1.5-pro-001",
+    max_tokens=8192,
    messages=[
        {
            "role": "user",
-            "content": "hello!",
+            "content": "what is the sample text about?",
        },
    ],
    temperature="0.7",
-    extra_body={"cached_content": cached_content.resource_name},
+    extra_body={"cached_content": cached_content_name}, # 👈 key change
 )

 print("response from proxy", response)
--- a/litellm/proxy/tests/test_gemini_context_caching.py
+++ b/litellm/proxy/tests/test_gemini_context_caching.py
@ -1,38 +1,54 @@
 import datetime

+import httpx
 import openai
-import vertexai
-from vertexai.generative_models import Content, Part
-from vertexai.preview import caching
-from vertexai.preview.generative_models import GenerativeModel

-client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
-vertexai.init(project="adroit-crow-413218", location="us-central1")
+# Set Litellm proxy variables here
+LITELLM_BASE_URL = "http://0.0.0.0:4000"
+LITELLM_PROXY_API_KEY = "sk-1234"
+
+client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
+httpx_client = httpx.Client(timeout=30)
+
+################################
+# First create a cachedContents object
 print("creating cached content")
-contents_here: list[Content] = [
-    Content(role="user", parts=[Part.from_text("huge string of text here" * 10000)])
-]
-cached_content = caching.CachedContent.create(
-    model_name="gemini-1.5-pro-001",
-    contents=contents_here,
-    expire_time=datetime.datetime(2024, 8, 10),
+create_cache = httpx_client.post(
+    url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
+    headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
+    json={
+        "model": "gemini-1.5-pro-001",
+        "contents": [
+            {
+                "role": "user",
+                "parts": [
+                    {
+                        "text": "This is sample text to demonstrate explicit caching."
+                        * 4000
+                    }
+                ],
+            }
+        ],
+    },
 )
+print("response from create_cache", create_cache)
+create_cache_response = create_cache.json()
+print("json from create_cache", create_cache_response)
+cached_content_name = create_cache_response["name"]

-created_Caches = caching.CachedContent.list()
-
-print("created_Caches contents=", created_Caches)
-
+#################################
+# Use the `cachedContents` object in your /chat/completions
 response = client.chat.completions.create(  # type: ignore
    model="gemini-1.5-pro-001",
    max_tokens=8192,
    messages=[
        {
            "role": "user",
-            "content": "quote all everything above this message",
+            "content": "what is the sample text about?",
        },
    ],
    temperature="0.7",
-    extra_body={"cached_content": cached_content.resource_name},
+    extra_body={"cached_content": cached_content_name},  # 👈 key change
 )

 print("response from proxy", response)