From d6493b0e7fc1b05b4b218c689c0af7bbfd5db53a Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 21 Aug 2024 13:03:41 -0700
Subject: [PATCH] docs semantic caching qdrant

---
 docs/my-website/docs/proxy/caching.md        | 100 +++++++++++--------
 litellm/proxy/common_utils/callback_utils.py |  14 ++-
 litellm/proxy/proxy_server.py                |   8 +-
 3 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 71fae3e805..c2adca88a0 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -104,6 +104,66 @@ $ litellm --config /path/to/config.yaml
```

Caching can be enabled by adding the `cache` key to the `config.yaml`

#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: openai-embedding
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  set_verbose: True
  cache: True # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: qdrant-semantic
    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined in the model_list
    qdrant_collection_name: test_collection
    qdrant_quantization_config: binary
    similarity_threshold: 0.8 # similarity threshold for semantic cache
```

#### Step 2: Add Qdrant Credentials to your .env

```shell
QDRANT_API_KEY = "16rJUMBRx*************"
QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
```

#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```

#### Step 4: Test it

```shell
curl -i http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "fake-openai-endpoint",
    "messages": [
      {"role": "user", "content": "Hello"}
    ]
  }'
```

**Expect to see `x-litellm-semantic-similarity` in the response headers when semantic caching is on.**
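You can run the same check from a script. The snippet below is a minimal sketch, not part of this patch: it assumes the proxy from Step 3 is running on `localhost:4000` and accepts the `sk-1234` key from the curl example. It sends the same prompt twice, so the similarity header should appear on the second, cache-hit response.

```python
# Minimal sketch: call the proxy twice with an identical prompt and inspect
# the x-litellm-semantic-similarity response header. Assumes the proxy from
# Step 3 is running locally with the sk-1234 key used above.
import requests

URL = "http://localhost:4000/v1/chat/completions"
HEADERS = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}
PAYLOAD = {
    "model": "fake-openai-endpoint",
    "messages": [{"role": "user", "content": "Hello"}],
}

for attempt in (1, 2):
    resp = requests.post(URL, headers=HEADERS, json=PAYLOAD, timeout=30)
    # On a semantic cache hit, the proxy attaches the similarity score of the
    # matched cache entry as a response header.
    print(f"attempt {attempt}:", resp.headers.get("x-litellm-semantic-similarity"))
```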

#### Step 1: Add `cache` to the config.yaml

@@ -185,46 +245,6 @@ $ litellm --config /path/to/config.yaml
```

Caching can be enabled by adding the `cache` key in the `config.yaml`

#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: openai-embedding
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  set_verbose: True
  cache: True # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: qdrant-semantic
    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
    qdrant_collection_name: test_collection
    qdrant_quantization_config: binary
    similarity_threshold: 0.8 # similarity threshold for semantic cache
```

#### Step 2: Add Qdrant Credentials to your .env

```shell
QDRANT_API_KEY = "16rJUMBRx*************"
QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
```

#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```

diff --git a/litellm/proxy/common_utils/callback_utils.py b/litellm/proxy/common_utils/callback_utils.py
index 243ae18135..fa976690e6 100644
--- a/litellm/proxy/common_utils/callback_utils.py
+++ b/litellm/proxy/common_utils/callback_utils.py
@@ -285,14 +285,18 @@ def get_remaining_tokens_and_requests_from_request_data(data: Dict) -> Dict[str,
     return headers
 
 
-def get_applied_guardrails_header(request_data: Dict) -> Optional[Dict]:
+def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
     _metadata = request_data.get("metadata", None) or {}
+    headers = {}
     if "applied_guardrails" in _metadata:
-        return {
-            "x-litellm-applied-guardrails": ",".join(_metadata["applied_guardrails"]),
-        }
+        headers["x-litellm-applied-guardrails"] = ",".join(
+            _metadata["applied_guardrails"]
+        )
 
-    return None
+    if "semantic-similarity" in _metadata:
+        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
+
+    return headers
 
 
 def add_guardrail_to_applied_guardrails_header(
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 12069d5e85..a9d0325d80 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -149,7 +149,7 @@ from litellm.proxy.common_utils.admin_ui_utils import (
     show_missing_vars_in_env,
 )
 from litellm.proxy.common_utils.callback_utils import (
-    get_applied_guardrails_header,
+    get_logging_caching_headers,
     get_remaining_tokens_and_requests_from_request_data,
     initialize_callbacks_on_proxy,
 )
@@ -543,9 +543,9 @@ def get_custom_headers(
     )
     headers.update(remaining_tokens_header)
 
-    applied_guardrails = get_applied_guardrails_header(request_data)
-    if applied_guardrails:
-        headers.update(applied_guardrails)
+    logging_caching_headers = get_logging_caching_headers(request_data)
+    if logging_caching_headers:
+        headers.update(logging_caching_headers)
 
     try:
         return {
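For reference, the reworked helper can be exercised on its own. The snippet below is a standalone sketch, not part of the patch: it copies `get_logging_caching_headers` exactly as added to `callback_utils.py` above and feeds it hypothetical request metadata (the `0.87` score is made up for illustration).

```python
# Standalone copy of the helper added in callback_utils.py above, exercised
# with hypothetical metadata to show the headers it produces.
from typing import Dict, Optional


def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
    _metadata = request_data.get("metadata", None) or {}
    headers = {}

    if "applied_guardrails" in _metadata:
        headers["x-litellm-applied-guardrails"] = ",".join(
            _metadata["applied_guardrails"]
        )

    if "semantic-similarity" in _metadata:
        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])

    return headers


# A semantic cache hit recorded in request metadata (0.87 is a made-up score):
print(get_logging_caching_headers({"metadata": {"semantic-similarity": 0.87}}))
# -> {'x-litellm-semantic-similarity': '0.87'}

# No guardrail or caching metadata: the empty dict is falsy, so
# get_custom_headers in proxy_server.py adds no extra headers.
print(get_logging_caching_headers({}))
# -> {}
```

Since the function now always returns a dict, the `if logging_caching_headers:` check in `get_custom_headers` treats an empty result as nothing to add, mirroring the old `None` behavior.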