diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 71fae3e805..c2adca88a0 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -104,6 +104,66 @@ $ litellm --config /path/to/config.yaml
```
+
+
+
+Caching can be enabled by adding the `cache` key to the `config.yaml`.
+
+#### Step 1: Add `cache` to the config.yaml
+```yaml
+model_list:
+ - model_name: fake-openai-endpoint
+ litellm_params:
+ model: openai/fake
+ api_key: fake-key
+ api_base: https://exampleopenaiendpoint-production.up.railway.app/
+ - model_name: openai-embedding
+ litellm_params:
+ model: openai/text-embedding-3-small
+ api_key: os.environ/OPENAI_API_KEY
+
+litellm_settings:
+ set_verbose: True
+ cache: True # set cache responses to True, litellm defaults to using a redis cache
+ cache_params:
+ type: qdrant-semantic
+ qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
+ qdrant_collection_name: test_collection
+ qdrant_quantization_config: binary
+ similarity_threshold: 0.8 # similarity threshold for semantic cache
+```
+
+#### Step 2: Add Qdrant Credentials to your .env
+
+```shell
+QDRANT_API_KEY = "16rJUMBRx*************"
+QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
+```
+
+#### Step 3: Run proxy with config
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+
+#### Step 4: Test it
+
+```shell
+curl -i http://localhost:4000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer sk-1234" \
+ -d '{
+ "model": "fake-openai-endpoint",
+ "messages": [
+ {"role": "user", "content": "Hello"}
+ ]
+ }'
+```
+
+**Expect to see the `x-litellm-semantic-similarity` header in the response when semantic caching is on.**
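+
+You can also read the header programmatically. A minimal sketch using Python `requests`, assuming the proxy from Step 3 is running on `localhost:4000` with the example `sk-1234` key:
+
+```python
+import requests
+
+# Send the same request as the curl example above and inspect the response headers.
+# Assumes the proxy is reachable at localhost:4000 with the example `sk-1234` key.
+response = requests.post(
+    "http://localhost:4000/v1/chat/completions",
+    headers={"Authorization": "Bearer sk-1234"},
+    json={
+        "model": "fake-openai-endpoint",
+        "messages": [{"role": "user", "content": "Hello"}],
+    },
+)
+print(response.headers.get("x-litellm-semantic-similarity"))
+```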
+
+
+
#### Step 1: Add `cache` to the config.yaml
@@ -185,46 +245,6 @@ $ litellm --config /path/to/config.yaml
-
-
-Caching can be enabled by adding the `cache` key in the `config.yaml`
-
-#### Step 1: Add `cache` to the config.yaml
-```yaml
-model_list:
- - model_name: fake-openai-endpoint
- litellm_params:
- model: openai/fake
- api_key: fake-key
- api_base: https://exampleopenaiendpoint-production.up.railway.app/
- - model_name: openai-embedding
- litellm_params:
- model: openai/text-embedding-3-small
- api_key: os.environ/OPENAI_API_KEY
-
-litellm_settings:
- set_verbose: True
- cache: True # set cache responses to True, litellm defaults to using a redis cache
- cache_params:
- type: qdrant-semantic
- qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
- qdrant_collection_name: test_collection
- qdrant_quantization_config: binary
- similarity_threshold: 0.8 # similarity threshold for semantic cache
-```
-
-#### Step 2: Add Qdrant Credentials to your .env
-
-```shell
-QDRANT_API_KEY = "16rJUMBRx*************"
-QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
-```
-
-#### Step 3: Run proxy with config
-```shell
-$ litellm --config /path/to/config.yaml
-```
-
diff --git a/litellm/proxy/common_utils/callback_utils.py b/litellm/proxy/common_utils/callback_utils.py
index 243ae18135..fa976690e6 100644
--- a/litellm/proxy/common_utils/callback_utils.py
+++ b/litellm/proxy/common_utils/callback_utils.py
@@ -285,14 +285,18 @@ def get_remaining_tokens_and_requests_from_request_data(data: Dict) -> Dict[str,
return headers
-def get_applied_guardrails_header(request_data: Dict) -> Optional[Dict]:
+def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
_metadata = request_data.get("metadata", None) or {}
+ headers = {}
if "applied_guardrails" in _metadata:
- return {
- "x-litellm-applied-guardrails": ",".join(_metadata["applied_guardrails"]),
- }
+ headers["x-litellm-applied-guardrails"] = ",".join(
+ _metadata["applied_guardrails"]
+ )
- return None
+ if "semantic-similarity" in _metadata:
+ headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
+
+ return headers
def add_guardrail_to_applied_guardrails_header(
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 12069d5e85..a9d0325d80 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -149,7 +149,7 @@ from litellm.proxy.common_utils.admin_ui_utils import (
show_missing_vars_in_env,
)
from litellm.proxy.common_utils.callback_utils import (
- get_applied_guardrails_header,
+ get_logging_caching_headers,
get_remaining_tokens_and_requests_from_request_data,
initialize_callbacks_on_proxy,
)
@@ -543,9 +543,9 @@ def get_custom_headers(
)
headers.update(remaining_tokens_header)
- applied_guardrails = get_applied_guardrails_header(request_data)
- if applied_guardrails:
- headers.update(applied_guardrails)
+ logging_caching_headers = get_logging_caching_headers(request_data)
+ if logging_caching_headers:
+ headers.update(logging_caching_headers)
try:
return {