From c5ee8024c53fd044fbb8621175c5d882c4f82a51 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 2 Oct 2023 11:46:49 -0700
Subject: [PATCH] fix caching docs on usage

---
 docs/my-website/docs/caching/caching.md | 55 ++++++++++++++-----------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/docs/my-website/docs/caching/caching.md b/docs/my-website/docs/caching/caching.md
index 996c5ec7d..cffd1674d 100644
--- a/docs/my-website/docs/caching/caching.md
+++ b/docs/my-website/docs/caching/caching.md
@@ -18,8 +18,16 @@ from litellm.caching import Cache
 litellm.cache = Cache()
 
 # Make completion calls
-response1 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])
-response2 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])
+response1 = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    caching=True
+)
+response2 = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    caching=True
+)
 
 # response1 == response2, response 1 is cached
 ```
@@ -39,8 +47,16 @@ from litellm.caching import Cache
 litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password>)
 
 # Make completion calls
-response1 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])
-response2 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}])
+response1 = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    caching=True
+)
+response2 = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    caching=True
+)
 
 # response1 == response2, response 1 is cached
 ```
@@ -70,21 +86,6 @@
 litellm.cache = cache # set litellm.cache to your cache
 ```
 
-### Controlling Caching for each litellm.completion call
-
-`completion()` lets you pass in `caching` (bool) [default False] to control whether to returned cached responses or not
-
-Using the caching flag
-**Ensure you have initialized litellm.cache to your cache object**
-
-```python
-from litellm import completion
-
-response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=True)
-
-response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.1, caching=False)
-
-```
 ### Detecting Cached Responses
 
 For responses that were returned as a cache hit, the response includes a param `cache` = True
@@ -115,10 +116,18 @@ from litellm.caching import Cache
 litellm.cache = Cache()
 
 # Make completion calls
-response1 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}], stream=True)
+response1 = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    stream=True,
+    caching=True)
 for chunk in response1:
     print(chunk)
-response2 = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Tell me a joke."}], stream=True)
+response2 = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    stream=True,
+    caching=True)
 for chunk in response2:
     print(chunk)
 ```
@@ -134,12 +143,12 @@ from litellm.caching import Cache
 litellm.cache = Cache()
 
 start_time = time.time()
-embedding1 = embedding(model="text-embedding-ada-002", input=["hello from litellm"*5])
+embedding1 = embedding(model="text-embedding-ada-002", input=["hello from litellm"*5], caching=True)
 end_time = time.time()
 print(f"Embedding 1 response time: {end_time - start_time} seconds")
 
 start_time = time.time()
-embedding2 = embedding(model="text-embedding-ada-002", input=["hello from litellm"*5])
+embedding2 = embedding(model="text-embedding-ada-002", input=["hello from litellm"*5], caching=True)
 end_time = time.time()
 print(f"Embedding 2 response time: {end_time - start_time} seconds")
 ```
\ No newline at end of file