Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 18:54:30 +00:00)

commit b522ade709 (parent b196f41d64): docs semantic caching qdrant

3 changed files with 73 additions and 49 deletions
@@ -104,6 +104,66 @@ $ litellm --config /path/to/config.yaml
 ```
 </TabItem>
+
+<TabItem value="qdrant-semantic" label="Qdrant Semantic cache">
+
+Caching can be enabled by adding the `cache` key in the `config.yaml`
+
+#### Step 1: Add `cache` to the config.yaml
+```yaml
+model_list:
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+  - model_name: openai-embedding
+    litellm_params:
+      model: openai/text-embedding-3-small
+      api_key: os.environ/OPENAI_API_KEY
+
+litellm_settings:
+  set_verbose: True
+  cache: True          # set cache responses to True, litellm defaults to using a redis cache
+  cache_params:
+    type: qdrant-semantic
+    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
+    qdrant_collection_name: test_collection
+    qdrant_quantization_config: binary
+    similarity_threshold: 0.8   # similarity threshold for semantic cache
+```
+
+#### Step 2: Add Qdrant Credentials to your .env
+
+```shell
+QDRANT_API_KEY = "16rJUMBRx*************"
+QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
+```
+
+#### Step 3: Run proxy with config
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+#### Step 4: Test it
+
+```shell
+curl -i http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "fake-openai-endpoint",
+    "messages": [
+      {"role": "user", "content": "Hello"}
+    ]
+  }'
+```
+
+**Expect to see `x-litellm-semantic-similarity` in the response headers when semantic caching is on.**
+
+</TabItem>
+
 <TabItem value="s3" label="s3 cache">

 #### Step 1: Add `cache` to the config.yaml
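Once the proxy is running with this config, the cache behavior can also be checked from a script instead of raw curl. Below is a minimal Python sketch, not part of the commit, assuming the proxy from the example above is reachable at `http://localhost:4000` with master key `sk-1234` and the sample `fake-openai-endpoint` model; it sends two semantically similar prompts and reads the `x-litellm-semantic-similarity` header from the second response.

```python
# Minimal sketch: verify semantic-cache headers against a running proxy.
# Assumes localhost:4000, key "sk-1234", and the "fake-openai-endpoint"
# model, all taken from the curl example in the docs above.
import requests

URL = "http://localhost:4000/v1/chat/completions"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer sk-1234",
}

def ask(content: str) -> requests.Response:
    """POST one chat completion to the proxy and return the raw response."""
    payload = {
        "model": "fake-openai-endpoint",
        "messages": [{"role": "user", "content": content}],
    }
    return requests.post(URL, headers=HEADERS, json=payload)

ask("What is the capital of France?")     # cold request, populates the cache
second = ask("What's France's capital?")  # semantically close follow-up

# Header is set by the proxy when semantic caching is on and a lookup ran.
print(second.headers.get("x-litellm-semantic-similarity"))
```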
@@ -185,46 +245,6 @@ $ litellm --config /path/to/config.yaml
 </TabItem>
-
-
-<TabItem value="qdrant-semantic" label="Qdrant Semantic cache">
-
-Caching can be enabled by adding the `cache` key in the `config.yaml`
-
-#### Step 1: Add `cache` to the config.yaml
-```yaml
-model_list:
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: openai-embedding
-    litellm_params:
-      model: openai/text-embedding-3-small
-      api_key: os.environ/OPENAI_API_KEY
-
-litellm_settings:
-  set_verbose: True
-  cache: True          # set cache responses to True, litellm defaults to using a redis cache
-  cache_params:
-    type: qdrant-semantic
-    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
-    qdrant_collection_name: test_collection
-    qdrant_quantization_config: binary
-    similarity_threshold: 0.8   # similarity threshold for semantic cache
-```
-
-#### Step 2: Add Qdrant Credentials to your .env
-
-```shell
-QDRANT_API_KEY = "16rJUMBRx*************"
-QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io"
-```
-
-#### Step 3: Run proxy with config
-```shell
-$ litellm --config /path/to/config.yaml
-```
-</TabItem>
-
 </Tabs>
@@ -285,14 +285,18 @@ def get_remaining_tokens_and_requests_from_request_data(data: Dict) -> Dict[str,
     return headers


-def get_applied_guardrails_header(request_data: Dict) -> Optional[Dict]:
+def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
     _metadata = request_data.get("metadata", None) or {}
+    headers = {}
     if "applied_guardrails" in _metadata:
-        return {
-            "x-litellm-applied-guardrails": ",".join(_metadata["applied_guardrails"]),
-        }
+        headers["x-litellm-applied-guardrails"] = ",".join(
+            _metadata["applied_guardrails"]
+        )

-    return None
+    if "semantic-similarity" in _metadata:
+        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
+
+    return headers


 def add_guardrail_to_applied_guardrails_header(
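The net effect of this hunk: the helper is renamed and now accumulates both the guardrail and the semantic-similarity headers into one dict, returning a possibly empty dict instead of `None` (the caller's truthiness check treats both the same way). A self-contained sketch of the new function as shown in the hunk, exercised with illustrative metadata; the sample guardrail names and similarity score below are made up:

```python
# Standalone copy of the function from the hunk above, plus a quick
# sanity check of both header branches with made-up metadata values.
from typing import Dict, Optional


def get_logging_caching_headers(request_data: Dict) -> Optional[Dict]:
    _metadata = request_data.get("metadata", None) or {}
    headers = {}
    if "applied_guardrails" in _metadata:
        headers["x-litellm-applied-guardrails"] = ",".join(
            _metadata["applied_guardrails"]
        )

    if "semantic-similarity" in _metadata:
        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])

    return headers


request_data = {
    "metadata": {
        "applied_guardrails": ["pii_masking", "prompt_injection"],  # made-up names
        "semantic-similarity": 0.8674,                              # made-up score
    }
}
print(get_logging_caching_headers(request_data))
# {'x-litellm-applied-guardrails': 'pii_masking,prompt_injection',
#  'x-litellm-semantic-similarity': '0.8674'}
```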
@@ -149,7 +149,7 @@ from litellm.proxy.common_utils.admin_ui_utils import (
     show_missing_vars_in_env,
 )
 from litellm.proxy.common_utils.callback_utils import (
-    get_applied_guardrails_header,
+    get_logging_caching_headers,
     get_remaining_tokens_and_requests_from_request_data,
     initialize_callbacks_on_proxy,
 )
@@ -543,9 +543,9 @@ def get_custom_headers(
     )
     headers.update(remaining_tokens_header)

-    applied_guardrails = get_applied_guardrails_header(request_data)
-    if applied_guardrails:
-        headers.update(applied_guardrails)
+    logging_caching_headers = get_logging_caching_headers(request_data)
+    if logging_caching_headers:
+        headers.update(logging_caching_headers)

     try:
         return {
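At the call site, the renamed helper's dict is merged into the outgoing response headers only when it is non-empty, following the same `headers.update(...)` pattern already used for the rate-limit headers. A minimal standalone sketch of that merge pattern, with stubbed base headers standing in for the real proxy state:

```python
# Illustrative only: the merge pattern from get_custom_headers, with a
# stub base-header dict in place of the real proxy state.
from typing import Dict


def get_logging_caching_headers(request_data: Dict) -> Dict:
    _metadata = request_data.get("metadata", None) or {}
    headers = {}
    if "semantic-similarity" in _metadata:
        headers["x-litellm-semantic-similarity"] = str(_metadata["semantic-similarity"])
    return headers


def build_response_headers(request_data: Dict) -> Dict:
    headers: Dict[str, str] = {"x-litellm-base": "stub"}  # placeholder base headers
    logging_caching_headers = get_logging_caching_headers(request_data)
    if logging_caching_headers:  # empty dict -> nothing to merge
        headers.update(logging_caching_headers)
    return headers


print(build_response_headers({"metadata": {"semantic-similarity": 0.91}}))
# {'x-litellm-base': 'stub', 'x-litellm-semantic-similarity': '0.91'}
```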