docs semantic caching qdrant

This commit is contained in:
Ishaan Jaff 2024-08-21 13:03:41 -07:00
parent e7ecb2fe3a
commit d6493b0e7f
3 changed files with 73 additions and 49 deletions


@@ -104,6 +104,66 @@ $ litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="qdrant-semantic" label="Qdrant Semantic cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`.
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: openai-embedding
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: qdrant-semantic
    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
    qdrant_collection_name: test_collection
    qdrant_quantization_config: binary
    similarity_threshold: 0.8   # similarity threshold for semantic cache
```
#### Step 2: Add Qdrant Credentials to your .env
```shell
QDRANT_API_KEY = "16rJUMBRx*************"
QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
```
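Optionally, before starting the proxy in Step 3, you can sanity-check these Qdrant credentials. The short Python sketch below is not part of the LiteLLM setup; it assumes the separate `qdrant-client` package and simply connects with the same environment variables and lists the collections on the cluster:
```python
# Optional connectivity check for the Qdrant credentials in your .env.
# Assumes `pip install qdrant-client`; not required for the proxy itself.
import os

from qdrant_client import QdrantClient

client = QdrantClient(
    url=os.environ["QDRANT_API_BASE"],
    api_key=os.environ["QDRANT_API_KEY"],
)

# Lists all collections on the cluster; `test_collection` may not exist yet,
# which is fine -- this only confirms the URL and API key are accepted.
print(client.get_collections())
```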
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
#### Step 4: Test it
```shell
curl -i http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "fake-openai-endpoint",
    "messages": [
      {"role": "user", "content": "Hello"}
    ]
  }'
```
**Expect to see `x-litellm-semantic-similarity` in the response headers when semantic caching is on**
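If you prefer to verify this from Python, here is a minimal sketch (assuming the `openai` Python SDK v1.x pointed at the proxy) that sends the same request twice and prints the similarity header, which should appear once the semantic cache is serving hits:
```python
# Minimal check: call the proxy twice and inspect the semantic-cache header.
# Assumes the `openai` SDK (v1.x) and the proxy running on localhost:4000.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

for attempt in (1, 2):
    # with_raw_response exposes the HTTP headers alongside the parsed body
    raw = client.chat.completions.with_raw_response.create(
        model="fake-openai-endpoint",
        messages=[{"role": "user", "content": "Hello"}],
    )
    similarity = raw.headers.get("x-litellm-semantic-similarity")
    print(f"attempt {attempt}: x-litellm-semantic-similarity={similarity}")
```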
</TabItem>
<TabItem value="s3" label="s3 cache">
#### Step 1: Add `cache` to the config.yaml
@@ -185,46 +245,6 @@ $ litellm --config /path/to/config.yaml
</TabItem>
<TabItem value="qdrant-semantic" label="Qdrant Semantic cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`.
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: openai-embedding
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  set_verbose: True
  cache: True          # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: qdrant-semantic
    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
    qdrant_collection_name: test_collection
    qdrant_quantization_config: binary
    similarity_threshold: 0.8   # similarity threshold for semantic cache
```
#### Step 2: Add Qdrant Credentials to your .env
```shell
QDRANT_API_KEY = "16rJUMBRx*************"
QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
```
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>


@@ -285,14 +285,18 @@ def get_remaining_tokens_and_requests_from_request_data(data: Dict) -> Dict[str,
     return headers


-def get_applied_guardrails_header(request_data: Dict) -> Optional[Dict]:
+def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
     _metadata = request_data.get("metadata", None) or {}
+    headers = {}
     if "applied_guardrails" in _metadata:
-        return {
-            "x-litellm-applied-guardrails": ",".join(_metadata["applied_guardrails"]),
-        }
-    return None
+        headers["x-litellm-applied-guardrails"] = ",".join(
+            _metadata["applied_guardrails"]
+        )
+    if "semantic-similarity" in _metadata:
+        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
+    return headers


 def add_guardrail_to_applied_guardrails_header(
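For reference, here is a standalone sketch of the behavior this hunk introduces; the helper is re-declared from the diff above (it is not meant to be imported from LiteLLM this way) and called with sample request metadata:
```python
# Standalone illustration of the new helper from the hunk above.
from typing import Dict, Optional


def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
    _metadata = request_data.get("metadata", None) or {}
    headers = {}
    if "applied_guardrails" in _metadata:
        headers["x-litellm-applied-guardrails"] = ",".join(
            _metadata["applied_guardrails"]
        )
    if "semantic-similarity" in _metadata:
        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
    return headers


# A request whose metadata carries a semantic-cache similarity score:
print(get_logging_caching_headers({"metadata": {"semantic-similarity": 0.93}}))
# -> {'x-litellm-semantic-similarity': '0.93'}
```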


@@ -149,7 +149,7 @@ from litellm.proxy.common_utils.admin_ui_utils import (
     show_missing_vars_in_env,
 )
 from litellm.proxy.common_utils.callback_utils import (
-    get_applied_guardrails_header,
+    get_logging_caching_headers,
     get_remaining_tokens_and_requests_from_request_data,
     initialize_callbacks_on_proxy,
 )
@@ -543,9 +543,9 @@ def get_custom_headers(
     )
     headers.update(remaining_tokens_header)

-    applied_guardrails = get_applied_guardrails_header(request_data)
-    if applied_guardrails:
-        headers.update(applied_guardrails)
+    logging_caching_headers = get_logging_caching_headers(request_data)
+    if logging_caching_headers:
+        headers.update(logging_caching_headers)

     try:
         return {