forked from phoenix/litellm-mirror
docs cachedContent endpoint
This commit is contained in:
parent
cae941f4c0
commit
a3dd3a19fa
2 changed files with 81 additions and 39 deletions
|
@ -440,12 +440,20 @@ Use Vertex AI Context Caching
|
||||||
1. Add model to config.yaml
|
1. Add model to config.yaml
|
||||||
```yaml
|
```yaml
|
||||||
model_list:
|
model_list:
|
||||||
|
# used for /chat/completions, /completions, /embeddings endpoints
|
||||||
- model_name: gemini-1.5-pro-001
|
- model_name: gemini-1.5-pro-001
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: vertex_ai_beta/gemini-1.5-pro-001
|
model: vertex_ai_beta/gemini-1.5-pro-001
|
||||||
vertex_project: "project-id"
|
vertex_project: "project-id"
|
||||||
vertex_location: "us-central1"
|
vertex_location: "us-central1"
|
||||||
vertex_credentials: "/path/to/service_account.json" # [OPTIONAL] Do this OR `!gcloud auth application-default login` - run this to add vertex credentials to your env
|
vertex_credentials: "/path/to/service_account.json" # [OPTIONAL] Do this OR `!gcloud auth application-default login` - run this to add vertex credentials to your env
|
||||||
|
|
||||||
|
# used for the /cachedContent and vertexAI native endpoints
|
||||||
|
default_vertex_config:
|
||||||
|
vertex_project: "adroit-crow-413218"
|
||||||
|
vertex_location: "us-central1"
|
||||||
|
vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Start Proxy
|
2. Start Proxy
|
||||||
|
@ -456,40 +464,58 @@ $ litellm --config /path/to/config.yaml
|
||||||
|
|
||||||
3. Make Request!
|
3. Make Request!
|
||||||
|
|
||||||
|
- First create a cachedContents object by calling the Vertex `cachedContents` endpoint. [VertexAI API Ref for cachedContents endpoint](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest). (LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API)
|
||||||
|
- Use the `cachedContents` object in your /chat/completions request to vertexAI
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import datetime
|
import datetime
|
||||||
import openai
|
import openai
|
||||||
import vertexai
|
import httpx
|
||||||
from vertexai.generative_models import Content, Part
|
|
||||||
from vertexai.preview import caching
|
|
||||||
from vertexai.preview.generative_models import GenerativeModel
|
|
||||||
|
|
||||||
# use Vertex AI SDK to create CachedContent
|
# Set Litellm proxy variables here
|
||||||
vertexai.init(project="adroit-crow-413218", location="us-central1")
|
LITELLM_BASE_URL = "http://0.0.0.0:4000"
|
||||||
|
LITELLM_PROXY_API_KEY = "sk-1234"
|
||||||
|
|
||||||
|
client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
|
||||||
|
httpx_client = httpx.Client(timeout=30)
|
||||||
|
|
||||||
|
################################
|
||||||
|
# First create a cachedContents object
|
||||||
|
# this request gets forwarded as is to: https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest
|
||||||
print("creating cached content")
|
print("creating cached content")
|
||||||
contents_here: list[Content] = [
|
create_cache = httpx_client.post(
|
||||||
Content(role="user", parts=[Part.from_text("huge string of text here" * 10000)])
|
url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
|
||||||
]
|
headers = {"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
|
||||||
cached_content = caching.CachedContent.create(
|
json = {
|
||||||
model_name="gemini-1.5-pro-001",
|
"model": "gemini-1.5-pro-001",
|
||||||
contents=contents_here,
|
"contents": [
|
||||||
expire_time=datetime.datetime(2024, 8, 10),
|
{
|
||||||
|
"role": "user",
|
||||||
|
"parts": [{
|
||||||
|
"text": "This is sample text to demonstrate explicit caching."*4000
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
print("response from create_cache", create_cache)
|
||||||
|
create_cache_response = create_cache.json()
|
||||||
|
print("json from create_cache", create_cache_response)
|
||||||
|
cached_content_name = create_cache_response["name"]
|
||||||
|
|
||||||
|
#################################
|
||||||
# use OpenAI SDK to send a request to LiteLLM Proxy
|
# Use the `cachedContents` object in your /chat/completions
|
||||||
# base_url is litellm proxy server and api_key is api key to litellm proxy
|
response = client.chat.completions.create( # type: ignore
|
||||||
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model="gemini-1.5-pro-001",
|
model="gemini-1.5-pro-001",
|
||||||
|
max_tokens=8192,
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": "hello!",
|
"content": "what is the sample text about?",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
temperature="0.7",
|
temperature="0.7",
|
||||||
extra_body={"cached_content": cached_content.resource_name},
|
extra_body={"cached_content": cached_content_name}, # 👈 key change
|
||||||
)
|
)
|
||||||
|
|
||||||
print("response from proxy", response)
|
print("response from proxy", response)
|
||||||
|
|
|
@ -1,38 +1,54 @@
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
|
import httpx
|
||||||
import openai
|
import openai
|
||||||
import vertexai
|
|
||||||
from vertexai.generative_models import Content, Part
|
|
||||||
from vertexai.preview import caching
|
|
||||||
from vertexai.preview.generative_models import GenerativeModel
|
|
||||||
|
|
||||||
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
|
# Set Litellm proxy variables here
|
||||||
vertexai.init(project="adroit-crow-413218", location="us-central1")
|
LITELLM_BASE_URL = "http://0.0.0.0:4000"
|
||||||
|
LITELLM_PROXY_API_KEY = "sk-1234"
|
||||||
|
|
||||||
|
client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
|
||||||
|
httpx_client = httpx.Client(timeout=30)
|
||||||
|
|
||||||
|
################################
|
||||||
|
# First create a cachedContents object
|
||||||
print("creating cached content")
|
print("creating cached content")
|
||||||
contents_here: list[Content] = [
|
create_cache = httpx_client.post(
|
||||||
Content(role="user", parts=[Part.from_text("huge string of text here" * 10000)])
|
url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
|
||||||
]
|
headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
|
||||||
cached_content = caching.CachedContent.create(
|
json={
|
||||||
model_name="gemini-1.5-pro-001",
|
"model": "gemini-1.5-pro-001",
|
||||||
contents=contents_here,
|
"contents": [
|
||||||
expire_time=datetime.datetime(2024, 8, 10),
|
{
|
||||||
|
"role": "user",
|
||||||
|
"parts": [
|
||||||
|
{
|
||||||
|
"text": "This is sample text to demonstrate explicit caching."
|
||||||
|
* 4000
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
print("response from create_cache", create_cache)
|
||||||
|
create_cache_response = create_cache.json()
|
||||||
|
print("json from create_cache", create_cache_response)
|
||||||
|
cached_content_name = create_cache_response["name"]
|
||||||
|
|
||||||
created_Caches = caching.CachedContent.list()
|
#################################
|
||||||
|
# Use the `cachedContents` object in your /chat/completions
|
||||||
print("created_Caches contents=", created_Caches)
|
|
||||||
|
|
||||||
response = client.chat.completions.create( # type: ignore
|
response = client.chat.completions.create( # type: ignore
|
||||||
model="gemini-1.5-pro-001",
|
model="gemini-1.5-pro-001",
|
||||||
max_tokens=8192,
|
max_tokens=8192,
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": "quote all everything above this message",
|
"content": "what is the sample text about?",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
temperature="0.7",
|
temperature="0.7",
|
||||||
extra_body={"cached_content": cached_content.resource_name},
|
extra_body={"cached_content": cached_content_name}, # 👈 key change
|
||||||
)
|
)
|
||||||
|
|
||||||
print("response from proxy", response)
|
print("response from proxy", response)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue