diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index a0e243a65..e65a104ef 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -440,12 +440,20 @@ Use Vertex AI Context Caching
 1. Add model to config.yaml
 ```yaml
 model_list:
+  # used for /chat/completions, /completions, /embeddings endpoints
   - model_name: gemini-1.5-pro-001
     litellm_params:
      model: vertex_ai_beta/gemini-1.5-pro-001
      vertex_project: "project-id"
      vertex_location: "us-central1"
      vertex_credentials: "/path/to/service_account.json" # [OPTIONAL] Do this OR `!gcloud auth application-default login` - run this to add vertex credentials to your env
+
+# used for the /cachedContents and Vertex AI native endpoints
+default_vertex_config:
+  vertex_project: "adroit-crow-413218"
+  vertex_location: "us-central1"
+  vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
+
 ```

 2. Start Proxy
@@ -456,40 +464,58 @@ $ litellm --config /path/to/config.yaml

 3. Make Request!

+- First, create a cachedContents object by calling the Vertex `cachedContents` endpoint. See the [Vertex AI API Ref for the cachedContents endpoint](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest). (The LiteLLM proxy forwards the `/cachedContents` request to the Vertex AI API.)
+- Then use the returned `cachedContents` object in your /chat/completions request to Vertex AI.
+
 ```python
 import datetime
 import openai
-import vertexai
-from vertexai.generative_models import Content, Part
-from vertexai.preview import caching
-from vertexai.preview.generative_models import GenerativeModel
+import httpx

-# use Vertex AI SDK to create CachedContent
-vertexai.init(project="adroit-crow-413218", location="us-central1")
+# Set LiteLLM proxy variables here
+LITELLM_BASE_URL = "http://0.0.0.0:4000"
+LITELLM_PROXY_API_KEY = "sk-1234"
+
+client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
+httpx_client = httpx.Client(timeout=30)
+
+################################
+# First create a cachedContents object
+# this request is forwarded as-is to: https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest
 print("creating cached content")
-contents_here: list[Content] = [
-    Content(role="user", parts=[Part.from_text("huge string of text here" * 10000)])
-]
-cached_content = caching.CachedContent.create(
-    model_name="gemini-1.5-pro-001",
-    contents=contents_here,
-    expire_time=datetime.datetime(2024, 8, 10),
+create_cache = httpx_client.post(
+    url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
+    headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
+    json={
+        "model": "gemini-1.5-pro-001",
+        "contents": [
+            {
+                "role": "user",
+                "parts": [{
+                    "text": "This is sample text to demonstrate explicit caching." * 4000
+                }]
+            }
+        ],
+    }
 )
+print("response from create_cache", create_cache)
+create_cache_response = create_cache.json()
+print("json from create_cache", create_cache_response)
+cached_content_name = create_cache_response["name"]

-
-# use OpenAI SDK to send a request to LiteLLM Proxy
-# base_url is litellm proxy server and api_key is api key to litellm proxy
-client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
-response = client.chat.completions.create(
+#################################
+# Use the `cachedContents` object in your /chat/completions request
+response = client.chat.completions.create(  # type: ignore
     model="gemini-1.5-pro-001",
+    max_tokens=8192,
     messages=[
         {
             "role": "user",
-            "content": "hello!",
+            "content": "what is the sample text about?",
         },
     ],
     temperature="0.7",
-    extra_body={"cached_content": cached_content.resource_name},
+    extra_body={"cached_content": cached_content_name},  # 👈 key change
 )

 print("response from proxy", response)
diff --git a/litellm/proxy/tests/test_gemini_context_caching.py b/litellm/proxy/tests/test_gemini_context_caching.py
index 40caaf1ea..6ee143dba 100644
--- a/litellm/proxy/tests/test_gemini_context_caching.py
+++ b/litellm/proxy/tests/test_gemini_context_caching.py
@@ -1,38 +1,54 @@
 import datetime

+import httpx
 import openai
-import vertexai
-from vertexai.generative_models import Content, Part
-from vertexai.preview import caching
-from vertexai.preview.generative_models import GenerativeModel

-client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
-vertexai.init(project="adroit-crow-413218", location="us-central1")
+# Set LiteLLM proxy variables here
+LITELLM_BASE_URL = "http://0.0.0.0:4000"
+LITELLM_PROXY_API_KEY = "sk-1234"
+
+client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
+httpx_client = httpx.Client(timeout=30)
+
+################################
+# First create a cachedContents object
 print("creating cached content")
-contents_here: list[Content] = [
-    Content(role="user", parts=[Part.from_text("huge string of text here" * 10000)])
-]
-cached_content = caching.CachedContent.create(
-    model_name="gemini-1.5-pro-001",
-    contents=contents_here,
-    expire_time=datetime.datetime(2024, 8, 10),
+create_cache = httpx_client.post(
+    url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
+    headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
+    json={
+        "model": "gemini-1.5-pro-001",
+        "contents": [
+            {
+                "role": "user",
+                "parts": [
+                    {
+                        "text": "This is sample text to demonstrate explicit caching."
+                        * 4000
+                    }
+                ],
+            }
+        ],
+    },
 )
+print("response from create_cache", create_cache)
+create_cache_response = create_cache.json()
+print("json from create_cache", create_cache_response)
+cached_content_name = create_cache_response["name"]

-created_Caches = caching.CachedContent.list()
-
-print("created_Caches contents=", created_Caches)
-
+#################################
+# Use the `cachedContents` object in your /chat/completions request
 response = client.chat.completions.create(  # type: ignore
     model="gemini-1.5-pro-001",
     max_tokens=8192,
     messages=[
         {
             "role": "user",
-            "content": "quote all everything above this message",
+            "content": "what is the sample text about?",
         },
     ],
     temperature="0.7",
-    extra_body={"cached_content": cached_content.resource_name},
+    extra_body={"cached_content": cached_content_name},  # 👈 key change
 )

 print("response from proxy", response)