docs semantic caching qdrant

This commit is contained in:
Ishaan Jaff 2024-08-21 13:03:41 -07:00
parent e7ecb2fe3a
commit d6493b0e7f
3 changed files with 73 additions and 49 deletions


@@ -104,6 +104,66 @@ $ litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="qdrant-semantic" label="Qdrant Semantic cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`.
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: openai-embedding
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: qdrant-semantic
    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
    qdrant_collection_name: test_collection
    qdrant_quantization_config: binary
    similarity_threshold: 0.8   # similarity threshold for semantic cache
```
#### Step 2: Add Qdrant Credentials to your .env
```shell
QDRANT_API_KEY = "16rJUMBRx*************"
QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
```
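Optionally, before starting the proxy in Step 3, you can sanity-check these Qdrant credentials. The short Python sketch below is not part of the LiteLLM setup; it assumes the separate `qdrant-client` package and simply connects with the same environment variables and lists the collections on the cluster:
```python
# Optional connectivity check for the Qdrant credentials in your .env.
# Assumes `pip install qdrant-client`; not required for the proxy itself.
import os

from qdrant_client import QdrantClient

client = QdrantClient(
    url=os.environ["QDRANT_API_BASE"],
    api_key=os.environ["QDRANT_API_KEY"],
)

# Lists all collections on the cluster; `test_collection` may not exist yet,
# which is fine -- this only confirms the URL and API key are accepted.
print(client.get_collections())
```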
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
#### Step 4: Test it
```shell
curl -i http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "fake-openai-endpoint",
    "messages": [
      {"role": "user", "content": "Hello"}
    ]
  }'
```
**Expect to see `x-litellm-semantic-similarity` in the response headers when semantic caching is on**
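If you prefer to verify this from Python, here is a minimal sketch (assuming the `openai` Python SDK v1.x pointed at the proxy) that sends the same request twice and prints the similarity header, which should appear once the semantic cache is serving hits:
```python
# Minimal check: call the proxy twice and inspect the semantic-cache header.
# Assumes the `openai` SDK (v1.x) and the proxy running on localhost:4000.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

for attempt in (1, 2):
    # with_raw_response exposes the HTTP headers alongside the parsed body
    raw = client.chat.completions.with_raw_response.create(
        model="fake-openai-endpoint",
        messages=[{"role": "user", "content": "Hello"}],
    )
    similarity = raw.headers.get("x-litellm-semantic-similarity")
    print(f"attempt {attempt}: x-litellm-semantic-similarity={similarity}")
```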
</TabItem>
<TabItem value="s3" label="s3 cache">
#### Step 1: Add `cache` to the config.yaml
@@ -185,46 +245,6 @@ $ litellm --config /path/to/config.yaml
</TabItem>
<TabItem value="qdrant-semantic" label="Qdrant Semantic cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`.
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: openai-embedding
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: qdrant-semantic
    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
    qdrant_collection_name: test_collection
    qdrant_quantization_config: binary
    similarity_threshold: 0.8   # similarity threshold for semantic cache
```
#### Step 2: Add Qdrant Credentials to your .env
```shell
QDRANT_API_KEY = "16rJUMBRx*************"
QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
```
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>


@@ -285,14 +285,18 @@ def get_remaining_tokens_and_requests_from_request_data(data: Dict) -> Dict[str,
     return headers


-def get_applied_guardrails_header(request_data: Dict) -> Optional[Dict]:
+def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
     _metadata = request_data.get("metadata", None) or {}
+    headers = {}
     if "applied_guardrails" in _metadata:
-        return {
-            "x-litellm-applied-guardrails": ",".join(_metadata["applied_guardrails"]),
-        }
-    return None
+        headers["x-litellm-applied-guardrails"] = ",".join(
+            _metadata["applied_guardrails"]
+        )
+    if "semantic-similarity" in _metadata:
+        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
+    return headers


 def add_guardrail_to_applied_guardrails_header(
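For reference, here is a standalone sketch of the behavior this hunk introduces; the helper is re-declared from the diff above (it is not meant to be imported from LiteLLM this way) and called with sample request metadata:
```python
# Standalone illustration of the new helper from the hunk above.
from typing import Dict, Optional


def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
    _metadata = request_data.get("metadata", None) or {}
    headers = {}
    if "applied_guardrails" in _metadata:
        headers["x-litellm-applied-guardrails"] = ",".join(
            _metadata["applied_guardrails"]
        )
    if "semantic-similarity" in _metadata:
        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
    return headers


# A request whose metadata carries a semantic-cache similarity score:
print(get_logging_caching_headers({"metadata": {"semantic-similarity": 0.93}}))
# -> {'x-litellm-semantic-similarity': '0.93'}
```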


@@ -149,7 +149,7 @@ from litellm.proxy.common_utils.admin_ui_utils import (
     show_missing_vars_in_env,
 )
 from litellm.proxy.common_utils.callback_utils import (
-    get_applied_guardrails_header,
+    get_logging_caching_headers,
     get_remaining_tokens_and_requests_from_request_data,
     initialize_callbacks_on_proxy,
 )
@@ -543,9 +543,9 @@ def get_custom_headers(
     )
     headers.update(remaining_tokens_header)

-    applied_guardrails = get_applied_guardrails_header(request_data)
-    if applied_guardrails:
-        headers.update(applied_guardrails)
+    logging_caching_headers = get_logging_caching_headers(request_data)
+    if logging_caching_headers:
+        headers.update(logging_caching_headers)

     try:
         return {