Merge branch 'main' into litellm_exp_mcp_server

Ishaan Jaff 2025-03-24 19:03:56 -07:00
commit c6424d6246
58 changed files with 2991 additions and 627 deletions

View file

@ -1855,7 +1855,7 @@ jobs:
command: | command: |
docker run -d \ docker run -d \
-p 4000:4000 \ -p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \ -e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \
-e STORE_MODEL_IN_DB="True" \ -e STORE_MODEL_IN_DB="True" \
-e LITELLM_MASTER_KEY="sk-1234" \ -e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \ -e LITELLM_LICENSE=$LITELLM_LICENSE \

View file

@ -4,7 +4,8 @@ python-dotenv
tiktoken tiktoken
importlib_metadata importlib_metadata
cohere cohere
redis redis==5.2.1
redisvl==0.4.1
anthropic anthropic
orjson==3.9.15 orjson==3.9.15
pydantic==2.10.2 pydantic==2.10.2

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
.python-version
.venv .venv
.env .env
.newenv .newenv

View file

@ -37,9 +37,6 @@ RUN pip install dist/*.whl
# install dependencies as wheels # install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt # ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y RUN pip uninstall PyJWT -y

View file

@ -1,35 +1,5 @@
version: "3.11" version: "3.11"
services: services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-stable
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
# command:
# - "--config=/app/config.yaml"
##############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db: db:
image: postgres:16 image: postgres:16
restart: always restart: always
@ -46,25 +16,3 @@ services:
interval: 1s interval: 1s
timeout: 5s timeout: 5s
retries: 10 retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
# ...rest of your docker-compose config if any

View file

@ -59,9 +59,6 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file # Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt # ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y RUN pip uninstall PyJWT -y

View file

@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies # Install build dependencies
RUN apt-get clean && apt-get update && \ RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \ apt-get install -y gcc g++ python3-dev && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip && \ RUN pip install --no-cache-dir --upgrade pip && \
@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file # Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
# ensure pyjwt is used, not jwt # ensure pyjwt is used, not jwt
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \ RUN pip uninstall jwt -y && \
pip uninstall jwt -y && \
pip uninstall PyJWT -y && \ pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir pip install PyJWT==2.9.0 --no-cache-dir

View file

@ -26,7 +26,7 @@ Install redis
pip install redis pip install redis
``` ```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
```python ```python
import litellm import litellm
@ -91,12 +91,12 @@ response2 = completion(
<TabItem value="redis-sem" label="redis-semantic cache"> <TabItem value="redis-sem" label="redis-semantic cache">
Install redis Install redisvl client
```shell ```shell
pip install redisvl==0.0.7 pip install redisvl==0.4.1
``` ```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
```python ```python
import litellm import litellm
@ -114,6 +114,7 @@ litellm.cache = Cache(
port=os.environ["REDIS_PORT"], port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"], password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
ttl=120,
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
) )
response1 = completion( response1 = completion(
@ -471,11 +472,13 @@ def __init__(
password: Optional[str] = None, password: Optional[str] = None,
namespace: Optional[str] = None, namespace: Optional[str] = None,
default_in_redis_ttl: Optional[float] = None, default_in_redis_ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None, redis_flush_size=None,
# redis semantic cache params
similarity_threshold: Optional[float] = None,
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
# s3 Bucket, boto3 configuration # s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None, s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None, s3_region_name: Optional[str] = None,

View file

@ -200,3 +200,92 @@ Expected Response
</TabItem> </TabItem>
</Tabs> </Tabs>
## OpenAI 'file' message type
This is currently only supported for OpenAI models.
This will be supported for all providers soon.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,308 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using Web Search
Use web search with LiteLLM
| Feature | Details |
|---------|---------|
| Supported Endpoints | - `/chat/completions` <br/> - `/responses` |
| Supported Providers | `openai` |
| LiteLLM Cost Tracking | ✅ Supported |
| LiteLLM Version | `v1.63.15-nightly` or higher |
## `/chat/completions` (litellm.completion)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
]
)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
# Customize search context size
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
</Tabs>
## `/responses` (litellm.responses)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview" # enables web search with default medium context size
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
# Customize search context size
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
## Checking if a model supports web search
<Tabs>
<TabItem label="SDK" value="sdk">
Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches
```python showLineNumbers
assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define OpenAI models in config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
model_info:
supports_web_search: True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Call `/model_group/info` to check if a model supports web search
```shell
curl -X 'GET' \
'http://localhost:4000/model_group/info' \
-H 'accept: application/json' \
-H 'x-api-key: sk-1234'
```
Expected Response
```json showLineNumbers
{
"data": [
{
"model_group": "gpt-4o-search-preview",
"providers": ["openai"],
"max_tokens": 128000,
"supports_web_search": true, # 👈 supports_web_search is true
}
]
}
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,66 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# SSL Security Settings
If you're in an environment using an older TLS bundle with older encryption, follow this guide.
LiteLLM uses HTTPX for network requests, unless otherwise specified.
1. Disable SSL verification
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_verify = False
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_verify: false
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_VERIFY="False"
```
</TabItem>
</Tabs>
2. Lower security settings
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_security_level = 1
litellm.ssl_certificate = "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_security_level: 1
ssl_certificate: "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_SECURITY_LEVEL="1"
export SSL_CERTIFICATE="/path/to/certificate.pem"
```
</TabItem>
</Tabs>

View file

@ -1,4 +1,7 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Arize AI # Arize AI
@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm
::: :::
<Image img={require('../../img/arize.png')} />
## Pre-Requisites ## Pre-Requisites
@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can
```python ```python
litellm.callbacks = ["arize"] litellm.callbacks = ["arize"]
``` ```
```python ```python
import litellm import litellm
import os import os
@ -48,7 +55,7 @@ response = litellm.completion(
### Using with LiteLLM Proxy ### Using with LiteLLM Proxy
1. Setup config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: gpt-4 - model_name: gpt-4
@ -60,13 +67,134 @@ model_list:
litellm_settings: litellm_settings:
callbacks: ["arize"] callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
environment_variables: environment_variables:
ARIZE_SPACE_KEY: "d0*****" ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****" ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc)
``` ```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}'
```
## Pass Arize Space/Key per-request
Supported parameters:
- `arize_api_key`
- `arize_space_key`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"),
arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"),
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem value="curl" label="CURL">
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}],
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}
)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Support & Talk to Founders ## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -291,14 +291,15 @@ response = completion(
) )
``` ```
## Azure O1 Models ## O-Series Models
| Model Name | Function Call | Azure OpenAI O-Series models are supported on LiteLLM.
|---------------------|----------------------------------------------------|
| o1-mini | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| o1-preview | `response = completion(model="azure/<your deployment name>", messages=messages)` |
Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support. LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs> <Tabs>
<TabItem value="sdk" label="SDK"> <TabItem value="sdk" label="SDK">
@ -306,32 +307,88 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami
```python ```python
import litellm import litellm
litellm.enable_preview_features = True # 👈 KEY CHANGE litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
response = litellm.completion(
model="azure/<your deployment name>",
messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
stream=True
)
for chunk in response:
print(chunk)
``` ```
</TabItem> </TabItem>
<TabItem value="proxy" label="Proxy"> <TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: o1-mini - model_name: o3-mini
litellm_params: litellm_params:
model: azure/o1-mini model: azure/o3-model
api_base: "os.environ/AZURE_API_BASE" api_base: os.environ/AZURE_API_BASE
api_key: "os.environ/AZURE_API_KEY" api_key: os.environ/AZURE_API_KEY
api_version: "os.environ/AZURE_API_VERSION" ```
litellm_settings: </TabItem>
enable_preview_features: true # 👈 KEY CHANGE </Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
## Azure Audio Model
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
response = completion(
model="azure/azure-openai-4o-audio",
messages=[
{
"role": "user",
"content": "I want to try out speech to speech"
}
],
modalities=["text","audio"],
audio={"voice": "alloy", "format": "wav"}
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: azure-openai-4o-audio
litellm_params:
model: azure/azure-openai-4o-audio
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
``` ```
2. Start proxy 2. Start proxy
@ -340,26 +397,22 @@ litellm_settings:
litellm --config /path/to/config.yaml litellm --config /path/to/config.yaml
``` ```
3. Test it 3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="o1-mini", messages = [ ```bash
{ curl http://localhost:4000/v1/chat/completions \
"role": "user", -H "Authorization: Bearer $LITELLM_API_KEY" \
"content": "this is a test request, write a short poem" -H "Content-Type: application/json" \
} -d '{
], "model": "azure-openai-4o-audio",
stream=True) "messages": [{"role": "user", "content": "I want to try out speech to speech"}],
"modalities": ["text","audio"],
for chunk in response: "audio": {"voice": "alloy", "format": "wav"}
print(chunk) }'
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>
@ -948,62 +1001,9 @@ Expected Response:
{"data":[{"id":"batch_R3V...} {"data":[{"id":"batch_R3V...}
``` ```
## O-Series Models
Azure OpenAI O-Series models are supported on LiteLLM.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>

View file

@ -1428,10 +1428,14 @@ response = litellm.embedding(
## Supported AWS Bedrock Models ## Supported AWS Bedrock Models
LiteLLM supports ALL Bedrock models.
Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
| Deepseek R1 | `completion(model='bedrock/us.deepseek.r1-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
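For example, here is a minimal sketch of calling the newly added Deepseek R1 entry (the AWS credential and region values below are placeholders; any standard boto3 auth mechanism should also work):

```python
import os
from litellm import completion

# Placeholder AWS credentials/region - replace with your own values
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = "us-east-1"

response = completion(
    model="bedrock/us.deepseek.r1-v1:0",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response.choices[0].message.content)
```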

View file

@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</TabItem> </TabItem>
</Tabs> </Tabs>
## Using Ollama FIM on `/v1/completions`
LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm._turn_on_debug() # turn on debug to see the request
from litellm import completion
response = completion(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama/llama3.1"
api_base: "http://localhost:11434"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml --detailed_debug
# RUNNING ON http://0.0.0.0:4000
```
3. Test it!
```python
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
response = client.completions.create(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
</Tabs>
## Using ollama `api/chat` ## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat` In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`

View file

@ -228,6 +228,92 @@ response = completion(
``` ```
## PDF File Parsing
OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>
## OpenAI Fine Tuned Models ## OpenAI Fine Tuned Models
| Model Name | Function Call | | Model Name | Function Call |
@ -449,26 +535,6 @@ response = litellm.acompletion(
) )
``` ```
### Using Helicone Proxy with LiteLLM
```python
import os
import litellm
from litellm import completion
os.environ["OPENAI_API_KEY"] = ""
# os.environ["OPENAI_API_BASE"] = ""
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
"Helicone-Cache-Enabled": "true",
}
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion("gpt-3.5-turbo", messages)
```
### Using OpenAI Proxy with LiteLLM ### Using OpenAI Proxy with LiteLLM
```python ```python

View file

@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o
import os import os
from litellm import completion from litellm import completion
os.environ["OPENROUTER_API_KEY"] = "" os.environ["OPENROUTER_API_KEY"] = ""
os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1
os.environ["OR_SITE_URL"] = "" # optional
os.environ["OR_APP_NAME"] = "" # optional os.environ["OR_SITE_URL"] = "" # [OPTIONAL]
os.environ["OR_APP_NAME"] = "" # [OPTIONAL]
response = completion( response = completion(
model="openrouter/google/palm-2-chat-bison", model="openrouter/google/palm-2-chat-bison",

View file

@ -70,6 +70,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
response: str, response: str,
): ):
pass pass
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for plugins that need to see the entire stream.
"""
async for item in response:
yield item
proxy_handler_instance = MyCustomHandler() proxy_handler_instance = MyCustomHandler()
``` ```

View file

@ -147,6 +147,7 @@ general_settings:
|------|------|-------------| |------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request | | completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database | | disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB. Including key/user/team spend updates. |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) | | disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached | | disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task | | disable_reset_budget | boolean | If true, turns off reset budget scheduled task |

View file

@ -10,10 +10,12 @@ Use this is you want to write code to run a custom guardrail
### 1. Write a `CustomGuardrail` Class ### 1. Write a `CustomGuardrail` Class
A CustomGuardrail has 3 methods to enforce guardrails A CustomGuardrail has 4 methods to enforce guardrails
- `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call - `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call
- `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency) - `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency)
- `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call - `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call
- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail
**[See detailed spec of methods here](#customguardrail-methods)** **[See detailed spec of methods here](#customguardrail-methods)**
@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail):
): ):
raise ValueError("Guardrail failed Coffee Detected") raise ValueError("Guardrail failed Coffee Detected")
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for guardrails that need to see the entire response, such as PII masking.
See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168
Triggered by mode: 'post_call'
"""
async for item in response:
yield item
``` ```

View file

@ -79,6 +79,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `response_cost` | `Optional[str]` | Optional response cost | | `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers | | `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation | | `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
| `litellm_model_name` | `Optional[str]` | Model name sent in request |
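A minimal sketch of reading the new field from a custom callback, assuming the standard logging payload is exposed to callbacks as `kwargs["standard_logging_object"]`:

```python
from litellm.integrations.custom_logger import CustomLogger


class ModelNameLogger(CustomLogger):
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        # standard_logging_object carries the StandardLoggingPayload for this request
        payload = kwargs.get("standard_logging_object") or {}
        hidden_params = payload.get("hidden_params") or {}
        # litellm_model_name = the model name that was actually sent to the provider
        print("litellm_model_name:", hidden_params.get("litellm_model_name"))
```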
## StandardLoggingModelInformation ## StandardLoggingModelInformation

View file

@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed | | `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
## Cost Tracking Headers ## Cost Tracking Headers
| Header | Type | Description | | Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------| |--------|------|-------------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call | | `x-litellm-response-cost` | float | Cost of the API call | |
| `x-litellm-key-spend` | float | Total spend for the API key | | `x-litellm-key-spend` | float | Total spend for the API key | ✅ |
## LiteLLM Specific Headers ## LiteLLM Specific Headers
| Header | Type | Description | | Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------| |--------|------|-------------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call | | `x-litellm-call-id` | string | Unique identifier for the API call | ✅ |
| `x-litellm-model-id` | string | Unique identifier for the model used | | `x-litellm-model-id` | string | Unique identifier for the model used | |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint | | `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ |
| `x-litellm-version` | string | Version of LiteLLM being used | | `x-litellm-version` | string | Version of LiteLLM being used | |
| `x-litellm-model-group` | string | Model group identifier | | `x-litellm-model-group` | string | Model group identifier | |
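As a minimal sketch, these headers can be read off the raw proxy response (the proxy URL and key below are placeholders; `httpx` is used here since LiteLLM itself relies on HTTPX):

```python
import httpx

resp = httpx.post(
    "http://0.0.0.0:4000/chat/completions",       # placeholder proxy URL
    headers={"Authorization": "Bearer sk-1234"},  # placeholder proxy key
    json={"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]},
)
# LiteLLM-specific headers documented above
print(resp.headers.get("x-litellm-call-id"))
print(resp.headers.get("x-litellm-response-cost"))
print(resp.headers.get("x-litellm-model-api-base"))
```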
## Response headers from LLM providers ## Response headers from LLM providers

Binary image file added (707 KiB); not shown.

View file

@ -26,14 +26,6 @@ This release is primarily focused on:
- UI - Credential Management, re-use credentials when adding new models - UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model - UI - Test Connection to LLM Provider before adding a model
:::info
This release will be live on 03/16/2025
:::
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
## Known Issues ## Known Issues
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test - 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test

View file

@ -0,0 +1,130 @@
---
title: v1.63.14-stable
slug: v1.63.14-stable
date: 2025-03-22T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.63.11-stable`.
This release brings:
- LLM Translation Improvements (MCP Support and Bedrock Application Profiles)
- Perf improvements for Usage-based Routing
- Streaming guardrail support via websockets
## Docker Run LiteLLM Proxy
```shell
docker run \
-e STORE_MODEL_IN_DB=True \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-v1.63.14-stable
```
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
- Azure gpt-4o - fixed pricing to latest global pricing - [PR](https://github.com/BerriAI/litellm/pull/9361)
- O1-Pro - add pricing + model information - [PR](https://github.com/BerriAI/litellm/pull/9397)
- Azure AI - mistral 3.1 small pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
- Azure - gpt-4.5-preview pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
## LLM Translation
1. **New LLM Features**
- Bedrock: Support bedrock application inference profiles [Docs](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile)
- Infer aws region from bedrock application profile id - (`arn:aws:bedrock:us-east-1:...`)
- Ollama - support calling via `/v1/completions` [Get Started](../../docs/providers/ollama#using-ollama-fim-on-v1completions)
- Bedrock - support `us.deepseek.r1-v1:0` model name [Docs](../../docs/providers/bedrock#supported-aws-bedrock-models)
- OpenRouter - `OPENROUTER_API_BASE` env var support [Docs](../../docs/providers/openrouter.md)
- Azure - add audio model parameter support - [Docs](../../docs/providers/azure#azure-audio-model)
- OpenAI - PDF File support [Docs](../../docs/completion/document_understanding#openai-file-message-type)
- OpenAI - o1-pro Responses API streaming support [Docs](../../docs/response_api.md#streaming)
- [BETA] MCP - Use MCP Tools with LiteLLM SDK [Docs](../../docs/mcp)
2. **Bug Fixes**
- Voyage: prompt token on embedding tracking fix - [PR](https://github.com/BerriAI/litellm/commit/56d3e75b330c3c3862dc6e1c51c1210e48f1068e)
- Sagemaker - Fix "Too little data for declared Content-Length" error - [PR](https://github.com/BerriAI/litellm/pull/9326)
- OpenAI-compatible models - fix issue when calling openai-compatible models w/ custom_llm_provider set - [PR](https://github.com/BerriAI/litellm/pull/9355)
- VertexAI - Embedding outputDimensionality support - [PR](https://github.com/BerriAI/litellm/commit/437dbe724620675295f298164a076cbd8019d304)
- Anthropic - return consistent json response format on streaming/non-streaming - [PR](https://github.com/BerriAI/litellm/pull/9437)
## Spend Tracking Improvements
- `litellm_proxy/` - support reading litellm response cost header from proxy, when using client sdk
- Reset Budget Job - fix budget reset error on keys/teams/users [PR](https://github.com/BerriAI/litellm/pull/9329)
- Streaming - Prevents final chunk w/ usage from being ignored (impacted bedrock streaming + cost tracking) [PR](https://github.com/BerriAI/litellm/pull/9314)
## UI
1. Users Page
- Feature: Control default internal user settings [PR](https://github.com/BerriAI/litellm/pull/9328)
2. Icons:
- Feature: Replace external "artificialanalysis.ai" icons by local svg [PR](https://github.com/BerriAI/litellm/pull/9374)
3. Sign In/Sign Out
- Fix: Default login when `default_user_id` user does not exist in DB [PR](https://github.com/BerriAI/litellm/pull/9395)
## Logging Integrations
- Support post-call guardrails for streaming responses [Get Started](../../docs/proxy/guardrails/custom_guardrail#1-write-a-customguardrail-class)
- Arize [Get Started](../../docs/observability/arize_integration)
- fix invalid package import [PR](https://github.com/BerriAI/litellm/pull/9338)
- migrate to using standardloggingpayload for metadata, ensures spans land successfully [PR](https://github.com/BerriAI/litellm/pull/9338)
- fix logging to just log the LLM I/O [PR](https://github.com/BerriAI/litellm/pull/9353)
- Dynamic API Key/Space param support [Get Started](../../docs/observability/arize_integration#pass-arize-spacekey-per-request)
- StandardLoggingPayload - Log litellm_model_name in payload. Allows knowing which model name was sent to the API provider [Get Started](../../docs/proxy/logging_spec#standardlogginghiddenparams)
- Prompt Management - Allow building custom prompt management integration [Get Started](../../docs/proxy/custom_prompt_management.md)
## Performance / Reliability improvements
- Redis Caching - add 5s default timeout, prevents hanging redis connection from impacting llm calls [PR](https://github.com/BerriAI/litellm/commit/db92956ae33ed4c4e3233d7e1b0c7229817159bf)
- Allow disabling all spend updates / writes to DB - patch to allow disabling all spend updates to DB with a flag [PR](https://github.com/BerriAI/litellm/pull/9331)
- Azure OpenAI - correctly re-use azure openai client, fixes perf issue from previous Stable release [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Azure OpenAI - uses litellm.ssl_verify on Azure/OpenAI clients [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Usage-based routing - Wildcard model support [Get Started](../../docs/proxy/usage_based_routing#wildcard-model-support)
- Usage-based routing - Support batch writing increments to redis - reduces latency to same as simple-shuffle [PR](https://github.com/BerriAI/litellm/pull/9357)
- Router - show reason for model cooldown on no healthy deployments available error [PR](https://github.com/BerriAI/litellm/pull/9438)
- Caching - add max value limit to an item in in-memory cache (1MB) - prevents OOM errors on large image urls being sent through proxy [PR](https://github.com/BerriAI/litellm/pull/9448)
## General Improvements
- Passthrough Endpoints - support returning api-base on pass-through endpoints Response Headers [Docs](../../docs/proxy/response_headers#litellm-specific-headers)
- SSL - support reading ssl security level from env var - Allows user to specify lower security settings [Get Started](../../docs/guides/security_settings)
- Credentials - only poll Credentials table when `STORE_MODEL_IN_DB` is True [PR](https://github.com/BerriAI/litellm/pull/9376)
- Image URL Handling - new architecture doc on image url handling [Docs](../../docs/proxy/image_handling)
- OpenAI - bump to pip install "openai==1.68.2" [PR](https://github.com/BerriAI/litellm/commit/e85e3bc52a9de86ad85c3dbb12d87664ee567a5a)
- Gunicorn - security fix - bump gunicorn==23.0.0 [PR](https://github.com/BerriAI/litellm/commit/7e9fc92f5c7fea1e7294171cd3859d55384166eb)
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.11-stable...v1.63.14.rc)

View file

@ -243,7 +243,9 @@ const sidebars = {
"exception_mapping", "exception_mapping",
"completion/provider_specific_params", "completion/provider_specific_params",
"guides/finetuned_models", "guides/finetuned_models",
"guides/security_settings",
"completion/audio", "completion/audio",
"completion/web_search",
"completion/document_understanding", "completion/document_understanding",
"completion/vision", "completion/vision",
"completion/json_mode", "completion/json_mode",

View file

@ -122,6 +122,9 @@ langsmith_batch_size: Optional[int] = None
prometheus_initialize_budget_metrics: Optional[bool] = False prometheus_initialize_budget_metrics: Optional[bool] = False
argilla_batch_size: Optional[int] = None argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = (
False # if you want to use v1 gcs pubsub logged payload
)
argilla_transformation_object: Optional[Dict[str, Any]] = None argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = ( _async_input_callback: List[Union[str, Callable, CustomLogger]] = (
[] []
@ -756,6 +759,7 @@ from .utils import (
create_pretrained_tokenizer, create_pretrained_tokenizer,
create_tokenizer, create_tokenizer,
supports_function_calling, supports_function_calling,
supports_web_search,
supports_response_schema, supports_response_schema,
supports_parallel_function_calling, supports_parallel_function_calling,
supports_vision, supports_vision,

View file

@ -88,16 +88,16 @@ class Cache:
s3_aws_session_token: Optional[str] = None, s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None, s3_config: Optional[Any] = None,
s3_path: Optional[str] = None, s3_path: Optional[str] = None,
redis_semantic_cache_use_async=False, redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_embedding_model="text-embedding-ada-002", redis_semantic_cache_index_name: Optional[str] = None,
redis_flush_size: Optional[int] = None, redis_flush_size: Optional[int] = None,
redis_startup_nodes: Optional[List] = None, redis_startup_nodes: Optional[List] = None,
disk_cache_dir=None, disk_cache_dir: Optional[str] = None,
qdrant_api_base: Optional[str] = None, qdrant_api_base: Optional[str] = None,
qdrant_api_key: Optional[str] = None, qdrant_api_key: Optional[str] = None,
qdrant_collection_name: Optional[str] = None, qdrant_collection_name: Optional[str] = None,
qdrant_quantization_config: Optional[str] = None, qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model="text-embedding-ada-002", qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
**kwargs, **kwargs,
): ):
""" """
@ -170,8 +170,8 @@ class Cache:
port=port, port=port,
password=password, password=password,
similarity_threshold=similarity_threshold, similarity_threshold=similarity_threshold,
use_async=redis_semantic_cache_use_async,
embedding_model=redis_semantic_cache_embedding_model, embedding_model=redis_semantic_cache_embedding_model,
index_name=redis_semantic_cache_index_name,
**kwargs, **kwargs,
) )
elif type == LiteLLMCacheType.QDRANT_SEMANTIC: elif type == LiteLLMCacheType.QDRANT_SEMANTIC:
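For reference, a minimal usage sketch of the updated semantic-cache parameters (the Redis connection values are placeholders; the import path is an assumption and may differ across LiteLLM versions):

```python
import litellm
from litellm import Cache  # assumed top-level export

litellm.cache = Cache(
    type="redis-semantic",
    host="localhost",          # placeholder Redis connection details
    port="6379",
    password="my-password",
    similarity_threshold=0.8,  # 1.0 = exact matches only, 0.0 = accept anything
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
    redis_semantic_cache_index_name="litellm_semantic_cache_index",
)
```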

View file

@ -1,337 +1,437 @@
""" """
Redis Semantic Cache implementation Redis Semantic Cache implementation for LiteLLM
Has 4 methods: The RedisSemanticCache provides semantic caching functionality using Redis as a backend.
- set_cache This cache stores responses based on the semantic similarity of prompts rather than
- get_cache exact matching, allowing for more flexible caching of LLM responses.
- async_set_cache
- async_get_cache This implementation uses RedisVL's SemanticCache to find semantically similar prompts
and their cached responses.
""" """
import ast import ast
import asyncio import asyncio
import json import json
from typing import Any import os
from typing import Any, Dict, List, Optional, Tuple
import litellm import litellm
from litellm._logging import print_verbose from litellm._logging import print_verbose
from litellm.litellm_core_utils.prompt_templates.common_utils import get_str_from_messages
from .base_cache import BaseCache from .base_cache import BaseCache
class RedisSemanticCache(BaseCache): class RedisSemanticCache(BaseCache):
"""
Redis-backed semantic cache for LLM responses.
This cache uses vector similarity to find semantically similar prompts that have been
previously sent to the LLM, allowing for cache hits even when prompts are not identical
but carry similar meaning.
"""
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
def __init__( def __init__(
self, self,
host=None, host: Optional[str] = None,
port=None, port: Optional[str] = None,
password=None, password: Optional[str] = None,
redis_url=None, redis_url: Optional[str] = None,
similarity_threshold=None, similarity_threshold: Optional[float] = None,
use_async=False, embedding_model: str = "text-embedding-ada-002",
embedding_model="text-embedding-ada-002", index_name: Optional[str] = None,
**kwargs, **kwargs,
): ):
from redisvl.index import SearchIndex
print_verbose(
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
)
if similarity_threshold is None:
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
schema = {
"index": {
"name": "litellm_semantic_cache_index",
"prefix": "litellm",
"storage_type": "hash",
},
"fields": {
"text": [{"name": "response"}],
"vector": [
{
"name": "litellm_embedding",
"dims": 1536,
"distance_metric": "cosine",
"algorithm": "flat",
"datatype": "float32",
}
],
},
}
if redis_url is None:
# if no url passed, check if host, port and password are passed, if not raise an Exception
if host is None or port is None or password is None:
# try checking env for host, port and password
import os
host = os.getenv("REDIS_HOST")
port = os.getenv("REDIS_PORT")
password = os.getenv("REDIS_PASSWORD")
if host is None or port is None or password is None:
raise Exception("Redis host, port, and password must be provided")
redis_url = "redis://:" + password + "@" + host + ":" + port
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
if use_async is False:
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url)
try:
self.index.create(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
elif use_async is True:
schema["index"]["name"] = "litellm_semantic_cache_index_async"
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url, use_async=True)
(The SearchIndex / VectorQuery based implementation above is removed; RedisSemanticCache is rewritten on top of redisvl's SemanticCache. The remainder of the new __init__ and the rewritten methods follow.)

        """
        Initialize the Redis Semantic Cache.

        Args:
            host: Redis host address
            port: Redis port
            password: Redis password
            redis_url: Full Redis URL (alternative to separate host/port/password)
            similarity_threshold: Threshold for semantic similarity (0.0 to 1.0)
                where 1.0 requires exact matches and 0.0 accepts any match
            embedding_model: Model to use for generating embeddings
            index_name: Name for the Redis index
            ttl: Default time-to-live for cache entries in seconds
            **kwargs: Additional arguments passed to the Redis client

        Raises:
            Exception: If similarity_threshold is not provided or required Redis
                connection information is missing
        """
        from redisvl.extensions.llmcache import SemanticCache
        from redisvl.utils.vectorize import CustomTextVectorizer

        if index_name is None:
            index_name = self.DEFAULT_REDIS_INDEX_NAME

        print_verbose(f"Redis semantic-cache initializing index - {index_name}")

        # Validate similarity threshold
        if similarity_threshold is None:
            raise ValueError("similarity_threshold must be provided, passed None")

        # Store configuration
        self.similarity_threshold = similarity_threshold

        # Convert similarity threshold [0,1] to distance threshold [0,2]
        # For cosine distance: 0 = most similar, 2 = least similar
        # While similarity: 1 = most similar, 0 = least similar
        self.distance_threshold = 1 - similarity_threshold
        self.embedding_model = embedding_model

        # Set up Redis connection
        if redis_url is None:
            try:
                # Attempt to use provided parameters or fallback to environment variables
                host = host or os.environ['REDIS_HOST']
                port = port or os.environ['REDIS_PORT']
                password = password or os.environ['REDIS_PASSWORD']
            except KeyError as e:
                # Raise a more informative exception if any of the required keys are missing
                missing_var = e.args[0]
                raise ValueError(f"Missing required Redis configuration: {missing_var}. "
                                 f"Provide {missing_var} or redis_url.") from e

            redis_url = f"redis://:{password}@{host}:{port}"

        print_verbose(f"Redis semantic-cache redis_url: {redis_url}")

        # Initialize the Redis vectorizer and cache
        cache_vectorizer = CustomTextVectorizer(self._get_embedding)

        self.llmcache = SemanticCache(
            name=index_name,
            redis_url=redis_url,
            vectorizer=cache_vectorizer,
            distance_threshold=self.distance_threshold,
            overwrite=False,
        )

    def _get_ttl(self, **kwargs) -> Optional[int]:
        """
        Get the TTL (time-to-live) value for cache entries.

        Args:
            **kwargs: Keyword arguments that may contain a custom TTL

        Returns:
            Optional[int]: The TTL value in seconds, or None if no TTL should be applied
        """
        ttl = kwargs.get("ttl")
        if ttl is not None:
            ttl = int(ttl)
        return ttl

    def _get_embedding(self, prompt: str) -> List[float]:
        """
        Generate an embedding vector for the given prompt using the configured embedding model.

        Args:
            prompt: The text to generate an embedding for

        Returns:
            List[float]: The embedding vector
        """
        # Create an embedding from prompt
        embedding_response = litellm.embedding(
            model=self.embedding_model,
            input=prompt,
            cache={"no-store": True, "no-cache": True},
        )
        embedding = embedding_response["data"][0]["embedding"]
        return embedding

    def _get_cache_logic(self, cached_response: Any) -> Any:
        """
        Process the cached response to prepare it for use.

        Args:
            cached_response: The raw cached response

        Returns:
            The processed cache response, or None if input was None
        """
        if cached_response is None:
            return cached_response

        # Convert bytes to string if needed
        if isinstance(cached_response, bytes):
            cached_response = cached_response.decode("utf-8")

        # Convert string representation to Python object
        try:
            cached_response = json.loads(cached_response)
        except json.JSONDecodeError:
            try:
                cached_response = ast.literal_eval(cached_response)
            except (ValueError, SyntaxError) as e:
                print_verbose(f"Error parsing cached response: {str(e)}")
                return None
        return cached_response

    def set_cache(self, key: str, value: Any, **kwargs) -> None:
        """
        Store a value in the semantic cache.

        Args:
            key: The cache key (not directly used in semantic caching)
            value: The response value to cache
            **kwargs: Additional arguments including 'messages' for the prompt
                and optional 'ttl' for time-to-live
        """
        print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic caching")
                return

            prompt = get_str_from_messages(messages)
            value_str = str(value)

            # Get TTL and store in Redis semantic cache
            ttl = self._get_ttl(**kwargs)
            if ttl is not None:
                self.llmcache.store(prompt, value_str, ttl=int(ttl))
            else:
                self.llmcache.store(prompt, value_str)
        except Exception as e:
            print_verbose(f"Error setting {value_str} in the Redis semantic cache: {str(e)}")

    def get_cache(self, key: str, **kwargs) -> Any:
        """
        Retrieve a semantically similar cached response.

        Args:
            key: The cache key (not directly used in semantic caching)
            **kwargs: Additional arguments including 'messages' for the prompt

        Returns:
            The cached response if a semantically similar prompt is found, else None
        """
        print_verbose(f"Redis semantic-cache get_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic cache lookup")
                return None

            prompt = get_str_from_messages(messages)
            # Check the cache for semantically similar prompts
            results = self.llmcache.check(prompt=prompt)

            # Return None if no similar prompts found
            if not results:
                return None

            # Process the best matching result
            cache_hit = results[0]
            vector_distance = float(cache_hit["vector_distance"])

            # Convert vector distance back to similarity score
            # For cosine distance: 0 = most similar, 2 = least similar
            # While similarity: 1 = most similar, 0 = least similar
            similarity = 1 - vector_distance

            cached_prompt = cache_hit["prompt"]
            cached_response = cache_hit["response"]

            print_verbose(
                f"Cache hit: similarity threshold: {self.similarity_threshold}, "
                f"actual similarity: {similarity}, "
                f"current prompt: {prompt}, "
                f"cached prompt: {cached_prompt}"
            )

            return self._get_cache_logic(cached_response=cached_response)
        except Exception as e:
            print_verbose(f"Error retrieving from Redis semantic cache: {str(e)}")

    async def _get_async_embedding(self, prompt: str, **kwargs) -> List[float]:
        """
        Asynchronously generate an embedding for the given prompt.

        Args:
            prompt: The text to generate an embedding for
            **kwargs: Additional arguments that may contain metadata

        Returns:
            List[float]: The embedding vector
        """
        from litellm.proxy.proxy_server import llm_model_list, llm_router

        # Route the embedding request through the proxy if appropriate
        router_model_names = (
            [m["model_name"] for m in llm_model_list]
            if llm_model_list is not None
            else []
        )

        try:
            if llm_router is not None and self.embedding_model in router_model_names:
                # Use the router for embedding generation
                user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
                embedding_response = await llm_router.aembedding(
                    model=self.embedding_model,
                    input=prompt,
                    cache={"no-store": True, "no-cache": True},
                    metadata={
                        "user_api_key": user_api_key,
                        "semantic-cache-embedding": True,
                        "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
                    },
                )
            else:
                # Generate embedding directly
                embedding_response = await litellm.aembedding(
                    model=self.embedding_model,
                    input=prompt,
                    cache={"no-store": True, "no-cache": True},
                )

            # Extract and return the embedding vector
            return embedding_response["data"][0]["embedding"]
        except Exception as e:
            print_verbose(f"Error generating async embedding: {str(e)}")
            raise ValueError(f"Failed to generate embedding: {str(e)}") from e

    async def async_set_cache(self, key: str, value: Any, **kwargs) -> None:
        """
        Asynchronously store a value in the semantic cache.

        Args:
            key: The cache key (not directly used in semantic caching)
            value: The response value to cache
            **kwargs: Additional arguments including 'messages' for the prompt
                and optional 'ttl' for time-to-live
        """
        print_verbose(f"Async Redis semantic-cache set_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic caching")
                return

            prompt = get_str_from_messages(messages)
            value_str = str(value)

            # Generate embedding for the value (response) to cache
            prompt_embedding = await self._get_async_embedding(prompt, **kwargs)

            # Get TTL and store in Redis semantic cache
            ttl = self._get_ttl(**kwargs)
            if ttl is not None:
                await self.llmcache.astore(
                    prompt,
                    value_str,
                    vector=prompt_embedding,  # Pass through custom embedding
                    ttl=ttl
                )
            else:
                await self.llmcache.astore(
                    prompt,
                    value_str,
                    vector=prompt_embedding  # Pass through custom embedding
                )
        except Exception as e:
            print_verbose(f"Error in async_set_cache: {str(e)}")

    async def async_get_cache(self, key: str, **kwargs) -> Any:
        """
        Asynchronously retrieve a semantically similar cached response.

        Args:
            key: The cache key (not directly used in semantic caching)
            **kwargs: Additional arguments including 'messages' for the prompt

        Returns:
            The cached response if a semantically similar prompt is found, else None
        """
        print_verbose(f"Async Redis semantic-cache get_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic cache lookup")
                kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
                return None

            prompt = get_str_from_messages(messages)

            # Generate embedding for the prompt
            prompt_embedding = await self._get_async_embedding(prompt, **kwargs)

            # Check the cache for semantically similar prompts
            results = await self.llmcache.acheck(
                prompt=prompt,
                vector=prompt_embedding
            )

            # handle results / cache hit
            if not results:
                kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
                return None

            cache_hit = results[0]
            vector_distance = float(cache_hit["vector_distance"])

            # Convert vector distance back to similarity
            # For cosine distance: 0 = most similar, 2 = least similar
            # While similarity: 1 = most similar, 0 = least similar
            similarity = 1 - vector_distance

            cached_prompt = cache_hit["prompt"]
            cached_response = cache_hit["response"]

            # update kwargs["metadata"] with similarity, don't rewrite the original metadata
            kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity

            print_verbose(
                f"Cache hit: similarity threshold: {self.similarity_threshold}, "
                f"actual similarity: {similarity}, "
                f"current prompt: {prompt}, "
                f"cached prompt: {cached_prompt}"
            )

            return self._get_cache_logic(cached_response=cached_response)
        except Exception as e:
            print_verbose(f"Error in async_get_cache: {str(e)}")
            kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0

    async def _index_info(self) -> Dict[str, Any]:
        """
        Get information about the Redis index.

        Returns:
            Dict[str, Any]: Information about the Redis index
        """
        aindex = await self.llmcache._get_async_index()
        return await aindex.info()

    async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]], **kwargs) -> None:
        """
        Asynchronously store multiple values in the semantic cache.

        Args:
            cache_list: List of (key, value) tuples to cache
            **kwargs: Additional arguments
        """
        try:
            tasks = []
            for val in cache_list:
                tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
            await asyncio.gather(*tasks)
        except Exception as e:
            print_verbose(f"Error in async_set_cache_pipeline: {str(e)}")

View file

@ -9,6 +9,9 @@ from pydantic import BaseModel
import litellm import litellm
import litellm._logging import litellm._logging
from litellm import verbose_logger from litellm import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.llms.anthropic.cost_calculation import ( from litellm.llms.anthropic.cost_calculation import (
cost_per_token as anthropic_cost_per_token, cost_per_token as anthropic_cost_per_token,
@ -57,6 +60,7 @@ from litellm.types.utils import (
LlmProvidersSet, LlmProvidersSet,
ModelInfo, ModelInfo,
PassthroughCallTypes, PassthroughCallTypes,
StandardBuiltInToolsParams,
Usage, Usage,
) )
from litellm.utils import ( from litellm.utils import (
@ -524,6 +528,7 @@ def completion_cost( # noqa: PLR0915
optional_params: Optional[dict] = None, optional_params: Optional[dict] = None,
custom_pricing: Optional[bool] = None, custom_pricing: Optional[bool] = None,
base_model: Optional[str] = None, base_model: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float: ) -> float:
""" """
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.
@ -802,6 +807,12 @@ def completion_cost( # noqa: PLR0915
rerank_billed_units=rerank_billed_units, rerank_billed_units=rerank_billed_units,
) )
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)
return _final_cost return _final_cost
except Exception as e: except Exception as e:
@ -861,6 +872,7 @@ def response_cost_calculator(
base_model: Optional[str] = None, base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None, custom_pricing: Optional[bool] = None,
prompt: str = "", prompt: str = "",
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float: ) -> float:
""" """
Returns Returns
@ -890,6 +902,7 @@ def response_cost_calculator(
custom_pricing=custom_pricing, custom_pricing=custom_pricing,
base_model=base_model, base_model=base_model,
prompt=prompt, prompt=prompt,
standard_built_in_tools_params=standard_built_in_tools_params,
) )
return response_cost return response_cost
except Exception as e: except Exception as e:

View file

@ -10,13 +10,16 @@ import asyncio
import json import json
import os import os
import traceback import traceback
from typing import TYPE_CHECKING, Any, Dict, List, Optional from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from litellm.types.utils import StandardLoggingPayload
if TYPE_CHECKING: if TYPE_CHECKING:
from litellm.proxy._types import SpendLogsPayload from litellm.proxy._types import SpendLogsPayload
else: else:
SpendLogsPayload = Any SpendLogsPayload = Any
import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import ( from litellm.llms.custom_httpx.http_handler import (
@ -61,7 +64,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.flush_lock = asyncio.Lock() self.flush_lock = asyncio.Lock()
super().__init__(**kwargs, flush_lock=self.flush_lock) super().__init__(**kwargs, flush_lock=self.flush_lock)
asyncio.create_task(self.periodic_flush()) asyncio.create_task(self.periodic_flush())
self.log_queue: List[SpendLogsPayload] = [] self.log_queue: List[Union[SpendLogsPayload, StandardLoggingPayload]] = []
async def construct_request_headers(self) -> Dict[str, str]: async def construct_request_headers(self) -> Dict[str, str]:
"""Construct authorization headers using Vertex AI auth""" """Construct authorization headers using Vertex AI auth"""
@ -115,13 +118,20 @@ class GcsPubSubLogger(CustomBatchLogger):
verbose_logger.debug( verbose_logger.debug(
"PubSub: Logging - Enters logging function for model %s", kwargs "PubSub: Logging - Enters logging function for model %s", kwargs
) )
        standard_logging_payload = kwargs.get("standard_logging_object", None)

        # Backwards compatibility with old logging payload
        if litellm.gcs_pub_sub_use_v1 is True:
            spend_logs_payload = get_logging_payload(
                kwargs=kwargs,
                response_obj=response_obj,
                start_time=start_time,
                end_time=end_time,
            )
            self.log_queue.append(spend_logs_payload)
        else:
            # New logging payload, StandardLoggingPayload
            self.log_queue.append(standard_logging_payload)
if len(self.log_queue) >= self.batch_size: if len(self.log_queue) >= self.batch_size:
await self.async_send_batch() await self.async_send_batch()
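For deployments whose Pub/Sub consumers still expect the old SpendLogs schema, the legacy path can be kept by setting the module-level flag referenced above (a sketch; the flag name is taken from this diff):

import litellm

# Keep publishing the legacy SpendLogsPayload instead of StandardLoggingPayload
litellm.gcs_pub_sub_use_v1 = True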
@ -155,7 +165,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.log_queue.clear() self.log_queue.clear()
async def publish_message( async def publish_message(
self, message: SpendLogsPayload self, message: Union[SpendLogsPayload, StandardLoggingPayload]
) -> Optional[Dict[str, Any]]: ) -> Optional[Dict[str, Any]]:
""" """
Publish message to Google Cloud Pub/Sub using REST API Publish message to Google Cloud Pub/Sub using REST API

View file

@ -35,6 +35,9 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.litellm_core_utils.get_litellm_params import get_litellm_params from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.litellm_core_utils.redact_messages import ( from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger, redact_message_input_output_from_custom_logger,
@ -60,6 +63,7 @@ from litellm.types.utils import (
ModelResponse, ModelResponse,
ModelResponseStream, ModelResponseStream,
RawRequestTypedDict, RawRequestTypedDict,
StandardBuiltInToolsParams,
StandardCallbackDynamicParams, StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders, StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams, StandardLoggingHiddenParams,
@ -264,7 +268,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.standard_callback_dynamic_params: StandardCallbackDynamicParams = ( self.standard_callback_dynamic_params: StandardCallbackDynamicParams = (
self.initialize_standard_callback_dynamic_params(kwargs) self.initialize_standard_callback_dynamic_params(kwargs)
) )
self.standard_built_in_tools_params: StandardBuiltInToolsParams = (
self.initialize_standard_built_in_tools_params(kwargs)
)
## TIME TO FIRST TOKEN LOGGING ## ## TIME TO FIRST TOKEN LOGGING ##
self.completion_start_time: Optional[datetime.datetime] = None self.completion_start_time: Optional[datetime.datetime] = None
self._llm_caching_handler: Optional[LLMCachingHandler] = None self._llm_caching_handler: Optional[LLMCachingHandler] = None
@ -369,6 +375,23 @@ class Logging(LiteLLMLoggingBaseClass):
""" """
return _initialize_standard_callback_dynamic_params(kwargs) return _initialize_standard_callback_dynamic_params(kwargs)
def initialize_standard_built_in_tools_params(
self, kwargs: Optional[Dict] = None
) -> StandardBuiltInToolsParams:
"""
Initialize the standard built-in tools params from the kwargs
checks if web_search_options in kwargs or tools and sets the corresponding attribute in StandardBuiltInToolsParams
"""
return StandardBuiltInToolsParams(
web_search_options=StandardBuiltInToolCostTracking._get_web_search_options(
kwargs or {}
),
file_search=StandardBuiltInToolCostTracking._get_file_search_tool_call(
kwargs or {}
),
)
def update_environment_variables( def update_environment_variables(
self, self,
litellm_params: Dict, litellm_params: Dict,
@ -495,6 +518,16 @@ class Logging(LiteLLMLoggingBaseClass):
} }
return data return data
def _get_masked_api_base(self, api_base: str) -> str:
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
return str(masked_api_base)
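A quick worked example of the masking above, using a hypothetical URL and key:

# api_base = "https://generativelanguage.googleapis.com/v1beta?key=abcd1234"
# "key=" is present, so the result is api_base[:key_index] + "*****" + api_base[-4:]
#   -> "https://generativelanguage.googleapis.com/v1beta?key=*****1234"
# URLs without "key=" are returned unchanged.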
def _pre_call(self, input, api_key, model=None, additional_args={}): def _pre_call(self, input, api_key, model=None, additional_args={}):
""" """
Common helper function across the sync + async pre-call function Common helper function across the sync + async pre-call function
@ -508,6 +541,9 @@ class Logging(LiteLLMLoggingBaseClass):
model model
): # if model name was changes pre-call, overwrite the initial model call name with the new one ): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model self.model_call_details["model"] = model
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915 def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
@ -691,15 +727,6 @@ class Logging(LiteLLMLoggingBaseClass):
headers = {} headers = {}
data = additional_args.get("complete_input_dict", {}) data = additional_args.get("complete_input_dict", {})
api_base = str(additional_args.get("api_base", "")) api_base = str(additional_args.get("api_base", ""))
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
self.model_call_details["litellm_params"]["api_base"] = masked_api_base
curl_command = self._get_request_curl_command( curl_command = self._get_request_curl_command(
api_base=api_base, api_base=api_base,
headers=headers, headers=headers,
@ -714,11 +741,12 @@ class Logging(LiteLLMLoggingBaseClass):
def _get_request_curl_command( def _get_request_curl_command(
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
) -> str: ) -> str:
masked_api_base = self._get_masked_api_base(api_base)
if headers is None: if headers is None:
headers = {} headers = {}
curl_command = "\n\nPOST Request Sent from LiteLLM:\n" curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n" curl_command += "curl -X POST \\\n"
curl_command += f"{api_base} \\\n" curl_command += f"{masked_api_base} \\\n"
masked_headers = self._get_masked_headers(headers) masked_headers = self._get_masked_headers(headers)
formatted_headers = " ".join( formatted_headers = " ".join(
[f"-H '{k}: {v}'" for k, v in masked_headers.items()] [f"-H '{k}: {v}'" for k, v in masked_headers.items()]
@ -903,6 +931,7 @@ class Logging(LiteLLMLoggingBaseClass):
"optional_params": self.optional_params, "optional_params": self.optional_params,
"custom_pricing": custom_pricing, "custom_pricing": custom_pricing,
"prompt": prompt, "prompt": prompt,
"standard_built_in_tools_params": self.standard_built_in_tools_params,
} }
except Exception as e: # error creating kwargs for cost calculation except Exception as e: # error creating kwargs for cost calculation
debug_info = StandardLoggingModelCostFailureDebugInformation( debug_info = StandardLoggingModelCostFailureDebugInformation(
@ -1067,6 +1096,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
elif isinstance(result, dict): # pass-through endpoints elif isinstance(result, dict): # pass-through endpoints
@ -1079,6 +1109,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
elif standard_logging_object is not None: elif standard_logging_object is not None:
@ -1102,6 +1133,7 @@ class Logging(LiteLLMLoggingBaseClass):
prompt="", prompt="",
completion=getattr(result, "content", ""), completion=getattr(result, "content", ""),
total_time=float_diff, total_time=float_diff,
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
return start_time, end_time, result return start_time, end_time, result
@ -1155,6 +1187,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
callbacks = self.get_combined_callback_list( callbacks = self.get_combined_callback_list(
@ -1695,6 +1728,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
callbacks = self.get_combined_callback_list( callbacks = self.get_combined_callback_list(
@ -1911,6 +1945,7 @@ class Logging(LiteLLMLoggingBaseClass):
status="failure", status="failure",
error_str=str(exception), error_str=str(exception),
original_exception=exception, original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
return start_time, end_time return start_time, end_time
@ -3367,6 +3402,7 @@ def get_standard_logging_object_payload(
status: StandardLoggingPayloadStatus, status: StandardLoggingPayloadStatus,
error_str: Optional[str] = None, error_str: Optional[str] = None,
original_exception: Optional[Exception] = None, original_exception: Optional[Exception] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> Optional[StandardLoggingPayload]: ) -> Optional[StandardLoggingPayload]:
try: try:
kwargs = kwargs or {} kwargs = kwargs or {}
@ -3542,6 +3578,7 @@ def get_standard_logging_object_payload(
guardrail_information=metadata.get( guardrail_information=metadata.get(
"standard_logging_guardrail_information", None "standard_logging_guardrail_information", None
), ),
standard_built_in_tools_params=standard_built_in_tools_params,
) )
emit_standard_logging_payload(payload) emit_standard_logging_payload(payload)

View file

@ -0,0 +1,199 @@
"""
Helper utilities for tracking the cost of built-in tools.
"""
from typing import Any, Dict, List, Optional
import litellm
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
ModelInfo,
ModelResponse,
SearchContextCostPerQuery,
StandardBuiltInToolsParams,
)
class StandardBuiltInToolCostTracking:
"""
Helper class for tracking the cost of built-in tools
Example: Web Search
"""
@staticmethod
def get_cost_for_built_in_tools(
model: str,
response_object: Any,
custom_llm_provider: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Get the cost of using built-in tools.
Supported tools:
- Web Search
"""
if standard_built_in_tools_params is not None:
if (
standard_built_in_tools_params.get("web_search_options", None)
is not None
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=standard_built_in_tools_params.get(
"web_search_options", None
),
model_info=model_info,
)
if standard_built_in_tools_params.get("file_search", None) is not None:
return StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=standard_built_in_tools_params.get("file_search", None),
)
if isinstance(response_object, ModelResponse):
if StandardBuiltInToolCostTracking.chat_completion_response_includes_annotations(
response_object
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
return 0.0
@staticmethod
def _safe_get_model_info(
model: str, custom_llm_provider: Optional[str] = None
) -> Optional[ModelInfo]:
try:
return litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
return None
@staticmethod
def get_cost_for_web_search(
web_search_options: Optional[WebSearchOptions] = None,
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If request includes `web_search_options`, calculate the cost of the web search.
"""
if web_search_options is None:
return 0.0
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
)
if web_search_options.get("search_context_size", None) == "low":
return search_context_pricing.get("search_context_size_low", 0.0)
elif web_search_options.get("search_context_size", None) == "medium":
return search_context_pricing.get("search_context_size_medium", 0.0)
elif web_search_options.get("search_context_size", None) == "high":
return search_context_pricing.get("search_context_size_high", 0.0)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
@staticmethod
def get_default_cost_for_web_search(
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If no web search options are provided, use the `search_context_size_medium` pricing.
https://platform.openai.com/docs/pricing#web-search
"""
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
) or {}
return search_context_pricing.get("search_context_size_medium", 0.0)
@staticmethod
def get_cost_for_file_search(
file_search: Optional[FileSearchTool] = None,
) -> float:
""" "
Charged at $2.50/1k calls
Doc: https://platform.openai.com/docs/pricing#built-in-tools
"""
if file_search is None:
return 0.0
return 2.5 / 1000
@staticmethod
def chat_completion_response_includes_annotations(
response_object: ModelResponse,
) -> bool:
for _choice in response_object.choices:
message = getattr(_choice, "message", None)
if (
message is not None
and hasattr(message, "annotations")
and message.annotations is not None
and len(message.annotations) > 0
):
return True
return False
@staticmethod
def _get_web_search_options(kwargs: Dict) -> Optional[WebSearchOptions]:
if "web_search_options" in kwargs:
return WebSearchOptions(**kwargs.get("web_search_options", {}))
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "web_search_preview"
)
if tools:
# Look for web search tool in the tools array
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_web_search_tool_call(tool):
return WebSearchOptions(**tool)
return None
@staticmethod
def _get_tools_from_kwargs(kwargs: Dict, tool_type: str) -> Optional[List[Dict]]:
if "tools" in kwargs:
tools = kwargs.get("tools", [])
return tools
return None
@staticmethod
def _get_file_search_tool_call(kwargs: Dict) -> Optional[FileSearchTool]:
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "file_search"
)
if tools:
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_file_search_tool_call(tool):
return FileSearchTool(**tool)
return None
@staticmethod
def _is_web_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "web_search_preview":
return True
if "search_context_size" in tool:
return True
return False
@staticmethod
def _is_file_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "file_search":
return True
return False
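A small sketch of how this helper is driven (the model name is an example; the returned value comes from the model's `search_context_cost_per_query` entry in the cost map, or 0.0 if none exists):

from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
    StandardBuiltInToolCostTracking,
)

# Web search: cost is looked up per search_context_size tier
web_search_cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
    model="gpt-4o-search-preview",  # example model
    response_object=None,
    standard_built_in_tools_params={
        "web_search_options": {"search_context_size": "medium"},
    },
)

# File search is flat-rate: $2.50 per 1k calls, i.e. 2.5 / 1000 = $0.0025 per call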

View file

@ -138,13 +138,22 @@ class ModelParamHelper:
TranscriptionCreateParamsNonStreaming, TranscriptionCreateParamsNonStreaming,
TranscriptionCreateParamsStreaming, TranscriptionCreateParamsStreaming,
) )
            non_streaming_kwargs = set(
                getattr(
                    TranscriptionCreateParamsNonStreaming, "__annotations__", {}
                ).keys()
            )
            streaming_kwargs = set(
                getattr(
                    TranscriptionCreateParamsStreaming, "__annotations__", {}
                ).keys()
            )

            all_transcription_kwargs = non_streaming_kwargs.union(streaming_kwargs)
            return all_transcription_kwargs
        except Exception as e:
            verbose_logger.debug("Error getting transcription kwargs %s", str(e))
            return set()
@staticmethod @staticmethod

View file

@ -5304,6 +5304,17 @@
"mode": "embedding", "mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models" "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
}, },
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": { "textembedding-gecko": {
"max_tokens": 3072, "max_tokens": 3072,
"max_input_tokens": 3072, "max_input_tokens": 3072,

View file

@ -5,7 +5,10 @@ model_list:
api_key: os.environ/AZURE_API_KEY api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090 api_base: http://0.0.0.0:8090
rpm: 3 rpm: 3
- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
litellm_settings: litellm_settings:
num_retries: 0 num_retries: 0

View file

@ -542,13 +542,10 @@ async def vertex_proxy_route(
user_api_key_dict, user_api_key_dict,
stream=is_streaming_request, # type: ignore stream=is_streaming_request, # type: ignore
) )
    except ProxyException as e:
        if headers_passed_through:
            e.message = f"No credentials found on proxy for project_name={vertex_project} + location={vertex_location}, check `/model/info` for allowed project + region combinations with `use_in_pass_through: true`. Headers were passed through directly but request failed with error: {e.message}"
        raise e
return received_value return received_value

View file

@ -1788,9 +1788,6 @@ class ProxyConfig:
reset_color_code, reset_color_code,
cache_password, cache_password,
) )
if cache_type == "redis-semantic":
# by default this should always be async
cache_params.update({"redis_semantic_cache_use_async": True})
# users can pass os.environ/ variables on the proxy - we should read them from the env # users can pass os.environ/ variables on the proxy - we should read them from the env
for key, value in cache_params.items(): for key, value in cache_params.items():
@ -6181,18 +6178,18 @@ async def model_info_v1( # noqa: PLR0915
) )
if len(all_models_str) > 0: if len(all_models_str) > 0:
model_names = all_models_str _relevant_models = []
llm_model_list = llm_router.get_model_list() for model in all_models_str:
router_models = llm_router.get_model_list(model_name=model)
if router_models is not None:
_relevant_models.extend(router_models)
if llm_model_list is not None: if llm_model_list is not None:
_relevant_models = [
m for m in llm_model_list if m["model_name"] in model_names
]
all_models = copy.deepcopy(_relevant_models) # type: ignore all_models = copy.deepcopy(_relevant_models) # type: ignore
else: else:
all_models = [] all_models = []
for model in all_models: for in_place_model in all_models:
model = _get_proxy_model_info(model=model) in_place_model = _get_proxy_model_info(model=in_place_model)
verbose_proxy_logger.debug("all_models: %s", all_models) verbose_proxy_logger.debug("all_models: %s", all_models)
return {"data": all_models} return {"data": all_models}

View file

@ -4924,6 +4924,11 @@ class Router:
and model_info["supports_function_calling"] is True # type: ignore and model_info["supports_function_calling"] is True # type: ignore
): ):
model_group_info.supports_function_calling = True model_group_info.supports_function_calling = True
if (
model_info.get("supports_web_search", None) is not None
and model_info["supports_web_search"] is True # type: ignore
):
model_group_info.supports_web_search = True
if ( if (
model_info.get("supported_openai_params", None) is not None model_info.get("supported_openai_params", None) is not None
and model_info["supported_openai_params"] is not None and model_info["supported_openai_params"] is not None
@ -5286,10 +5291,11 @@ class Router:
if len(returned_models) == 0: # check if wildcard route if len(returned_models) == 0: # check if wildcard route
potential_wildcard_models = self.pattern_router.route(model_name) potential_wildcard_models = self.pattern_router.route(model_name)
            if model_name is not None and potential_wildcard_models is not None:
                for m in potential_wildcard_models:
                    deployment_typed_dict = DeploymentTypedDict(**m)  # type: ignore
                    deployment_typed_dict["model_name"] = model_name
                    returned_models.append(deployment_typed_dict)
if model_name is None: if model_name is None:
returned_models += self.model_list returned_models += self.model_list

View file

@ -382,6 +382,53 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
cache_control: Optional[Union[dict, ChatCompletionCachedContent]] cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class WebSearchOptionsUserLocationApproximate(TypedDict, total=False):
city: str
"""Free text input for the city of the user, e.g. `San Francisco`."""
country: str
"""
The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
the user, e.g. `US`.
"""
region: str
"""Free text input for the region of the user, e.g. `California`."""
timezone: str
"""
The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
user, e.g. `America/Los_Angeles`.
"""
class WebSearchOptionsUserLocation(TypedDict, total=False):
approximate: Required[WebSearchOptionsUserLocationApproximate]
"""Approximate location parameters for the search."""
type: Required[Literal["approximate"]]
"""The type of location approximation. Always `approximate`."""
class WebSearchOptions(TypedDict, total=False):
search_context_size: Literal["low", "medium", "high"]
"""
High level guidance for the amount of context window space to use for the
search. One of `low`, `medium`, or `high`. `medium` is the default.
"""
user_location: Optional[WebSearchOptionsUserLocation]
"""Approximate location parameters for the search."""
class FileSearchTool(TypedDict, total=False):
type: Literal["file_search"]
"""The type of tool being defined: `file_search`"""
vector_store_ids: Optional[List[str]]
"""The IDs of the vector stores to search."""
class ChatCompletionAnnotationURLCitation(TypedDict, total=False): class ChatCompletionAnnotationURLCitation(TypedDict, total=False):
end_index: int end_index: int
"""The index of the last character of the URL citation in the message.""" """The index of the last character of the URL citation in the message."""

View file

@ -559,6 +559,7 @@ class ModelGroupInfo(BaseModel):
rpm: Optional[int] = None rpm: Optional[int] = None
supports_parallel_function_calling: bool = Field(default=False) supports_parallel_function_calling: bool = Field(default=False)
supports_vision: bool = Field(default=False) supports_vision: bool = Field(default=False)
supports_web_search: bool = Field(default=False)
supports_function_calling: bool = Field(default=False) supports_function_calling: bool = Field(default=False)
supported_openai_params: Optional[List[str]] = Field(default=[]) supported_openai_params: Optional[List[str]] = Field(default=[])
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None

View file

@ -32,7 +32,9 @@ from .llms.openai import (
ChatCompletionThinkingBlock, ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk, ChatCompletionToolCallChunk,
ChatCompletionUsageBlock, ChatCompletionUsageBlock,
FileSearchTool,
OpenAIChatCompletionChunk, OpenAIChatCompletionChunk,
WebSearchOptions,
) )
from .rerank import RerankResponse from .rerank import RerankResponse
@ -97,6 +99,13 @@ class ProviderSpecificModelInfo(TypedDict, total=False):
supports_pdf_input: Optional[bool] supports_pdf_input: Optional[bool]
supports_native_streaming: Optional[bool] supports_native_streaming: Optional[bool]
supports_parallel_function_calling: Optional[bool] supports_parallel_function_calling: Optional[bool]
supports_web_search: Optional[bool]
class SearchContextCostPerQuery(TypedDict, total=False):
search_context_size_low: float
search_context_size_medium: float
search_context_size_high: float
class ModelInfoBase(ProviderSpecificModelInfo, total=False): class ModelInfoBase(ProviderSpecificModelInfo, total=False):
@ -135,6 +144,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
output_cost_per_video_per_second: Optional[float] # only for vertex ai models output_cost_per_video_per_second: Optional[float] # only for vertex ai models
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
output_cost_per_second: Optional[float] # for OpenAI Speech models output_cost_per_second: Optional[float] # for OpenAI Speech models
search_context_cost_per_query: Optional[
SearchContextCostPerQuery
] # Cost for using web search tool
litellm_provider: Required[str] litellm_provider: Required[str]
mode: Required[ mode: Required[
@ -586,6 +598,11 @@ class Message(OpenAIObject):
# OpenAI compatible APIs like mistral API will raise an error if audio is passed in # OpenAI compatible APIs like mistral API will raise an error if audio is passed in
del self.audio del self.audio
if annotations is None:
# ensure default response matches OpenAI spec
# Some OpenAI compatible APIs raise an error if annotations are passed in
del self.annotations
if reasoning_content is None: if reasoning_content is None:
# ensure default response matches OpenAI spec # ensure default response matches OpenAI spec
del self.reasoning_content del self.reasoning_content
@ -1612,6 +1629,19 @@ class StandardLoggingUserAPIKeyMetadata(TypedDict):
user_api_key_end_user_id: Optional[str] user_api_key_end_user_id: Optional[str]
class StandardBuiltInToolsParams(TypedDict, total=False):
"""
    Standard built-in OpenAI tools parameters.

    This is used to calculate the cost of built-in tools; insert any standard built-in tools parameters here.

    OpenAI charges users based on the `web_search_options` parameter.
"""
web_search_options: Optional[WebSearchOptions]
file_search: Optional[FileSearchTool]
class StandardLoggingPromptManagementMetadata(TypedDict): class StandardLoggingPromptManagementMetadata(TypedDict):
prompt_id: str prompt_id: str
prompt_variables: Optional[dict] prompt_variables: Optional[dict]
@ -1729,6 +1759,7 @@ class StandardLoggingPayload(TypedDict):
model_parameters: dict model_parameters: dict
hidden_params: StandardLoggingHiddenParams hidden_params: StandardLoggingHiddenParams
guardrail_information: Optional[StandardLoggingGuardrailInformation] guardrail_information: Optional[StandardLoggingGuardrailInformation]
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams]
from typing import AsyncIterator, Iterator from typing import AsyncIterator, Iterator

View file

@ -1975,7 +1975,7 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
) )
def supports_web_search(model: str, custom_llm_provider: Optional[str]) -> bool: def supports_web_search(model: str, custom_llm_provider: Optional[str] = None) -> bool:
""" """
Check if the given model supports web search and return a boolean value. Check if the given model supports web search and return a boolean value.
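With the new default, the provider argument can now be omitted; a minimal sketch (the import path and model name are assumptions for illustration):

from litellm.utils import supports_web_search

# custom_llm_provider now defaults to None
has_web_search = supports_web_search("gpt-4o-search-preview")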
@ -4544,6 +4544,10 @@ def _get_model_info_helper( # noqa: PLR0915
supports_native_streaming=_model_info.get( supports_native_streaming=_model_info.get(
"supports_native_streaming", None "supports_native_streaming", None
), ),
supports_web_search=_model_info.get("supports_web_search", False),
search_context_cost_per_query=_model_info.get(
"search_context_cost_per_query", None
),
tpm=_model_info.get("tpm", None), tpm=_model_info.get("tpm", None),
rpm=_model_info.get("rpm", None), rpm=_model_info.get("rpm", None),
) )
@ -4612,6 +4616,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
supports_audio_input: Optional[bool] supports_audio_input: Optional[bool]
supports_audio_output: Optional[bool] supports_audio_output: Optional[bool]
supports_pdf_input: Optional[bool] supports_pdf_input: Optional[bool]
supports_web_search: Optional[bool]
Raises: Raises:
Exception: If the model is not mapped yet. Exception: If the model is not mapped yet.

View file

@ -5304,6 +5304,17 @@
"mode": "embedding", "mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models" "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
}, },
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": { "textembedding-gecko": {
"max_tokens": 3072, "max_tokens": 3072,
"max_input_tokens": 3072, "max_input_tokens": 3072,

78
poetry.lock generated
View file

@ -810,15 +810,15 @@ test = ["pytest (>=6)"]
[[package]] [[package]]
name = "fastapi" name = "fastapi"
version = "0.115.11" version = "0.115.12"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "extra == \"proxy\"" markers = "extra == \"proxy\""
files = [ files = [
{file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"}, {file = "fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d"},
{file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"}, {file = "fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681"},
] ]
[package.dependencies] [package.dependencies]
@ -1445,14 +1445,14 @@ type = ["pytest-mypy"]
[[package]] [[package]]
name = "iniconfig" name = "iniconfig"
version = "2.0.0" version = "2.1.0"
description = "brain-dead simple config-ini parsing" description = "brain-dead simple config-ini parsing"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.8"
groups = ["dev"] groups = ["dev"]
files = [ files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
] ]
[[package]] [[package]]
@ -2137,14 +2137,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]] [[package]]
name = "openai" name = "openai"
version = "1.66.3" version = "1.68.2"
description = "The official Python library for the openai API" description = "The official Python library for the openai API"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9"}, {file = "openai-1.68.2-py3-none-any.whl", hash = "sha256:24484cb5c9a33b58576fdc5acf0e5f92603024a4e39d0b99793dfa1eb14c2b36"},
{file = "openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9"}, {file = "openai-1.68.2.tar.gz", hash = "sha256:b720f0a95a1dbe1429c0d9bb62096a0d98057bcda82516f6e8af10284bdd5b19"},
] ]
[package.dependencies] [package.dependencies]
@ -2160,6 +2160,7 @@ typing-extensions = ">=4.11,<5"
[package.extras] [package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<15)"] realtime = ["websockets (>=13,<15)"]
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
[[package]] [[package]]
name = "orjson" name = "orjson"
@ -2477,24 +2478,24 @@ testing = ["google-api-core (>=1.31.5)"]
[[package]] [[package]]
name = "protobuf" name = "protobuf"
version = "5.29.3" version = "5.29.4"
description = "" description = ""
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "extra == \"extra-proxy\"" markers = "extra == \"extra-proxy\""
files = [ files = [
{file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"}, {file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
{file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"}, {file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
{file = "protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e"}, {file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84"}, {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f"}, {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"},
{file = "protobuf-5.29.3-cp38-cp38-win32.whl", hash = "sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252"}, {file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"},
{file = "protobuf-5.29.3-cp38-cp38-win_amd64.whl", hash = "sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107"}, {file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"},
{file = "protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7"}, {file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"},
{file = "protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = "sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da"}, {file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"},
{file = "protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f"}, {file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"},
{file = "protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620"}, {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
] ]
[[package]] [[package]]
@ -2809,6 +2810,25 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras] [package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-asyncio"
version = "0.21.2"
description = "Pytest support for asyncio"
optional = false
python-versions = ">=3.7"
groups = ["dev"]
files = [
{file = "pytest_asyncio-0.21.2-py3-none-any.whl", hash = "sha256:ab664c88bb7998f711d8039cacd4884da6430886ae8bbd4eded552ed2004f16b"},
{file = "pytest_asyncio-0.21.2.tar.gz", hash = "sha256:d67738fc232b94b326b9d060750beb16e0074210b98dd8b58a5239fa2a154f45"},
]
[package.dependencies]
pytest = ">=7.0.0"
[package.extras]
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
[[package]] [[package]]
name = "pytest-mock" name = "pytest-mock"
version = "3.14.0" version = "3.14.0"
@ -3279,15 +3299,15 @@ files = [
[[package]] [[package]]
name = "rq" name = "rq"
version = "2.1.0" version = "2.2.0"
description = "RQ is a simple, lightweight, library for creating background jobs, and processing them." description = "RQ is a simple, lightweight, library for creating background jobs, and processing them."
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "extra == \"proxy\"" markers = "extra == \"proxy\""
files = [ files = [
{file = "rq-2.1.0-py3-none-any.whl", hash = "sha256:3c6892c6ca848e5fb47c1875399a66f13656bf0e123bf725d9aa9a12718e2fdf"}, {file = "rq-2.2.0-py3-none-any.whl", hash = "sha256:dacbfe1ccb79a45c8cd95dec7951620679fa0195570b63da3f9347622d33accc"},
{file = "rq-2.1.0.tar.gz", hash = "sha256:764585b6cab69ef1412f4aee523347e5aa7ece3ca175c118b1d92223dd8c2826"}, {file = "rq-2.2.0.tar.gz", hash = "sha256:b636760f1e4c183022031c142faa0483e687885824e9732ba2953f994104e203"},
] ]
[package.dependencies] [package.dependencies]
@ -3606,15 +3626,15 @@ files = [
[[package]] [[package]]
name = "tzdata" name = "tzdata"
version = "2025.1" version = "2025.2"
description = "Provider of IANA time zone data" description = "Provider of IANA time zone data"
optional = true optional = true
python-versions = ">=2" python-versions = ">=2"
groups = ["main"] groups = ["main"]
markers = "extra == \"proxy\" and platform_system == \"Windows\"" markers = "extra == \"proxy\" and platform_system == \"Windows\""
files = [ files = [
{file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"},
{file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"},
] ]
[[package]] [[package]]
@ -3985,4 +4005,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "boto3", "cryptography", "fastapi",
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.8.1,<4.0, !=3.9.7" python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "f7c21b3d659e4a15cd46bb42fb905ad039028f4f6b82507fd1278ac05c412569" content-hash = "9c863b11189227a035a9130c8872de44fe7c5e1e32b47569a56af86e3f6570c5"

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "litellm" name = "litellm"
version = "1.63.14" version = "1.64.0"
description = "Library to easily interface with LLM API providers" description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"] authors = ["BerriAI"]
license = "MIT" license = "MIT"
@ -98,13 +98,14 @@ black = "^23.12.0"
mypy = "^1.0" mypy = "^1.0"
pytest = "^7.4.3" pytest = "^7.4.3"
pytest-mock = "^3.12.0" pytest-mock = "^3.12.0"
pytest-asyncio = "^0.21.1"
[build-system] [build-system]
requires = ["poetry-core", "wheel"] requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.commitizen] [tool.commitizen]
version = "1.63.14" version = "1.64.0"
version_files = [ version_files = [
"pyproject.toml:^version" "pyproject.toml:^version"
] ]

View file

@ -9,8 +9,8 @@ uvicorn==0.29.0 # server dep
gunicorn==23.0.0 # server dep gunicorn==23.0.0 # server dep
uvloop==0.21.0 # uvicorn dep, gives us much better performance under load uvloop==0.21.0 # uvicorn dep, gives us much better performance under load
boto3==1.34.34 # aws bedrock/sagemaker calls boto3==1.34.34 # aws bedrock/sagemaker calls
redis==5.0.0 # caching redis==5.2.1 # redis caching
numpy==2.1.1 # semantic caching redisvl==0.4.1 # semantic caching
prisma==0.11.0 # for db prisma==0.11.0 # for db
mangum==0.17.0 # for aws lambda functions mangum==0.17.0 # for aws lambda functions
pynacl==1.5.0 # for encrypting keys pynacl==1.5.0 # for encrypting keys
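For reference, these two pins back LiteLLM's Redis caches: redis==5.2.1 for the exact-match cache and redisvl==0.4.1 for the semantic cache. A minimal sketch of wiring them up, mirroring the semantic-cache tests later in this diff (the Cache import path is an assumption):

import os
import litellm
from litellm import completion
from litellm.caching.caching import Cache  # import path is an assumption

# Semantic caching backed by redisvl: a cached answer is reused when a new
# prompt falls within similarity_threshold of a previously cached prompt.
# For exact-match caching on redis, use type="redis" without a threshold.
litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,
)

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a one sentence poem about summer"}],
    caching=True,
)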

View file

@ -1,13 +1,8 @@
import asyncio
import json
import os import os
import sys import sys
import time
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import httpx
import pytest import pytest
import respx
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
sys.path.insert( sys.path.insert(
@ -18,9 +13,18 @@ from unittest.mock import AsyncMock
from litellm.caching.redis_cache import RedisCache from litellm.caching.redis_cache import RedisCache
@pytest.fixture
def redis_no_ping():
"""Patch RedisCache initialization to prevent async ping tasks from being created"""
with patch('asyncio.get_running_loop') as mock_get_loop:
# Simulate the absence of a running event loop so RedisCache skips scheduling its async ping task
mock_get_loop.side_effect = RuntimeError("No running event loop")
yield
@pytest.mark.parametrize("namespace", [None, "test"]) @pytest.mark.parametrize("namespace", [None, "test"])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_redis_cache_async_increment(namespace, monkeypatch): async def test_redis_cache_async_increment(namespace, monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host") monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache(namespace=namespace) redis_cache = RedisCache(namespace=namespace)
# Create an AsyncMock for the Redis client # Create an AsyncMock for the Redis client
@ -47,10 +51,46 @@ async def test_redis_cache_async_increment(namespace, monkeypatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_redis_client_init_with_socket_timeout(monkeypatch): async def test_redis_client_init_with_socket_timeout(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "my-fake-host") monkeypatch.setenv("REDIS_HOST", "my-fake-host")
redis_cache = RedisCache(socket_timeout=1.0) redis_cache = RedisCache(socket_timeout=1.0)
assert redis_cache.redis_kwargs["socket_timeout"] == 1.0 assert redis_cache.redis_kwargs["socket_timeout"] == 1.0
client = redis_cache.init_async_client() client = redis_cache.init_async_client()
assert client is not None assert client is not None
assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0 assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0
@pytest.mark.asyncio
async def test_redis_cache_async_batch_get_cache(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache()
# Create an AsyncMock for the Redis client
mock_redis_instance = AsyncMock()
# Make sure the mock can be used as an async context manager
mock_redis_instance.__aenter__.return_value = mock_redis_instance
mock_redis_instance.__aexit__.return_value = None
# Set up the return value for mget
mock_redis_instance.mget.return_value = [
b'{"key1": "value1"}',
None,
b'{"key3": "value3"}'
]
test_keys = ["key1", "key2", "key3"]
with patch.object(
redis_cache, "init_async_client", return_value=mock_redis_instance
):
# Call async_batch_get_cache
result = await redis_cache.async_batch_get_cache(key_list=test_keys)
# Verify mget was called with the correct keys
mock_redis_instance.mget.assert_called_once()
# Check that results were properly decoded
assert result["key1"] == {"key1": "value1"}
assert result["key2"] is None
assert result["key3"] == {"key3": "value3"}

View file

@ -0,0 +1,130 @@
import os
import sys
from unittest.mock import MagicMock, patch, AsyncMock
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Tests for RedisSemanticCache
def test_redis_semantic_cache_initialization(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=MagicMock())
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize the cache with a similarity threshold
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Verify the semantic cache was initialized with correct parameters
assert redis_semantic_cache.similarity_threshold == 0.8
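# Note: the cache stores a vector-distance threshold derived as 1 - similarity_threshold,
# so a similarity of 0.8 corresponds to the distance of 0.2 asserted below
# (cf. the "vector_distance" comments in the mocked results further down).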
# Use pytest.approx for floating point comparison to handle precision issues
assert redis_semantic_cache.distance_threshold == pytest.approx(0.2, abs=1e-10)
assert redis_semantic_cache.embedding_model == "text-embedding-ada-002"
# Test initialization with missing similarity_threshold
with pytest.raises(ValueError, match="similarity_threshold must be provided"):
RedisSemanticCache()
def test_redis_semantic_cache_get_cache(monkeypatch):
# Mock the redisvl import and embedding function
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the llmcache.check method to return a result
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.check = MagicMock(return_value=mock_result)
# Mock the embedding function
with patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}):
# Test get_cache with a message
result = redis_semantic_cache.get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}]
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify llmcache.check was called
redis_semantic_cache.llmcache.check.assert_called_once()
@pytest.mark.asyncio
async def test_redis_semantic_cache_async_get_cache(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the async methods
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.acheck = AsyncMock(return_value=mock_result)
redis_semantic_cache._get_async_embedding = AsyncMock(return_value=[0.1, 0.2, 0.3])
# Test async_get_cache with a message
result = await redis_semantic_cache.async_get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}],
metadata={}
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify methods were called
redis_semantic_cache._get_async_embedding.assert_called_once()
redis_semantic_cache.llmcache.acheck.assert_called_once()

View file

@ -0,0 +1,113 @@
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
import litellm
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import ModelInfo, ModelResponse, StandardBuiltInToolsParams
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Test basic web search cost calculations
def test_web_search_cost_low():
web_search_options = WebSearchOptions(search_context_size="low")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_low"]
)
def test_web_search_cost_medium():
web_search_options = WebSearchOptions(search_context_size="medium")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost
== model_info["search_context_cost_per_query"]["search_context_size_medium"]
)
def test_web_search_cost_high():
web_search_options = WebSearchOptions(search_context_size="high")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_high"]
)
# Test file search cost calculation
def test_file_search_cost():
file_search = FileSearchTool(type="file_search")
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=file_search
)
assert cost == 0.0025 # $2.50/1000 calls = 0.0025 per call
# Test edge cases
def test_none_inputs():
# Test with None inputs
assert (
StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=None, model_info=None
)
== 0.0
)
assert (
StandardBuiltInToolCostTracking.get_cost_for_file_search(file_search=None)
== 0.0
)
# Test the main get_cost_for_built_in_tools method
def test_get_cost_for_built_in_tools_web_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
web_search_options=WebSearchOptions(search_context_size="medium")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert isinstance(cost, float)
def test_get_cost_for_built_in_tools_file_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
file_search=FileSearchTool(type="file_search")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert cost == 0.0025

View file

@ -0,0 +1,34 @@
import json
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
import time
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
@pytest.fixture
def logging_obj():
return LitellmLogging(
model="bedrock/claude-3-5-sonnet-20240620-v1:0",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
)
def test_get_masked_api_base(logging_obj):
api_base = "https://api.openai.com/v1"
masked_api_base = logging_obj._get_masked_api_base(api_base)
assert masked_api_base == "https://api.openai.com/v1"
assert type(masked_api_base) == str

View file

@ -1,3 +1,4 @@
import asyncio
import datetime import datetime
import json import json
import os import os
@ -11,7 +12,13 @@ sys.path.insert(
0, os.path.abspath("../../../..") 0, os.path.abspath("../../../..")
) # Adds the parent directory to the system path ) # Adds the parent directory to the system path
from unittest.mock import MagicMock, patch
import litellm
from litellm.proxy._types import SpendLogsPayload
from litellm.proxy.hooks.proxy_track_cost_callback import _ProxyDBLogger
from litellm.proxy.proxy_server import app, prisma_client from litellm.proxy.proxy_server import app, prisma_client
from litellm.router import Router
@pytest.fixture @pytest.fixture
@ -400,3 +407,270 @@ async def test_ui_view_spend_logs_unauthorized(client):
headers={"Authorization": "Bearer invalid-token"}, headers={"Authorization": "Bearer invalid-token"},
) )
assert response.status_code == 401 or response.status_code == 403 assert response.status_code == 401 or response.status_code == 403
class TestSpendLogsPayload:
@pytest.mark.asyncio
async def test_spend_logs_payload_e2e(self):
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(litellm.proxy.proxy_server, "prisma_client"):
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="Hello, world!",
metadata={"user_api_key_end_user_id": "test_user_1"},
)
assert response.choices[0].message.content == "Hello, world!"
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "gpt-4o",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}',
"cache_key": "Cache OFF",
"spend": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "openai",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
def mock_anthropic_response(*args, **kwargs):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.headers = {"Content-Type": "application/json"}
mock_response.json.return_value = {
"content": [{"text": "Hi! My name is Claude.", "type": "text"}],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": None,
"type": "message",
"usage": {"input_tokens": 2095, "output_tokens": 503},
}
return mock_response
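# Worked spend check for the mocked usage above (2095 input + 503 output tokens),
# assuming claude-3-7-sonnet's model-map prices of $3 per 1M input tokens and
# $15 per 1M output tokens:
#     2095 * 3e-06 + 503 * 1.5e-05 = 0.006285 + 0.007545 = 0.01383
# which matches the "spend" value asserted in the expected payloads below.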
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_api_base(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await litellm.acompletion(
model="claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_router(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
router = Router(
model_list=[
{
"model_name": "my-anthropic-model-group",
"litellm_params": {
"model": "claude-3-7-sonnet-20250219",
},
"model_info": {
"id": "my-unique-model-id",
},
}
]
)
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await router.acompletion(
model="my-anthropic-model-group",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "my-anthropic-model-group",
"model_id": "my-unique-model-id",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"

View file

@ -477,6 +477,25 @@ def test_supports_function_calling(model, expected_bool):
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
"model, expected_bool",
[
("gpt-4o-mini-search-preview", True),
("openai/gpt-4o-mini-search-preview", True),
("gpt-4o-search-preview", True),
("openai/gpt-4o-search-preview", True),
("groq/deepseek-r1-distill-llama-70b", False),
("groq/llama-3.3-70b-versatile", False),
("codestral/codestral-latest", False),
],
)
def test_supports_web_search(model, expected_bool):
try:
assert litellm.supports_web_search(model=model) == expected_bool
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_get_max_token_unit_test(): def test_get_max_token_unit_test():
""" """
More complete testing in `test_completion_cost.py` More complete testing in `test_completion_cost.py`

View file

@ -794,7 +794,7 @@ def test_redis_cache_completion():
response3 = completion( response3 = completion(
model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5 model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
) )
response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True) response4 = completion(model="gpt-4o-mini", messages=messages, caching=True)
print("\nresponse 1", response1) print("\nresponse 1", response1)
print("\nresponse 2", response2) print("\nresponse 2", response2)
@ -1690,20 +1690,12 @@ def test_cache_context_managers():
print("VARS of litellm.cache", vars(litellm.cache)) print("VARS of litellm.cache", vars(litellm.cache))
# test_cache_context_managers()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
def test_redis_semantic_cache_completion(): def test_redis_semantic_cache_completion():
litellm.set_verbose = True litellm.set_verbose = True
import logging import logging
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding /reading from cache
print("testing semantic caching") print("testing semantic caching")
litellm.cache = Cache( litellm.cache = Cache(
type="redis-semantic", type="redis-semantic",
@ -1718,33 +1710,30 @@ def test_redis_semantic_cache_completion():
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summer",
} }
], ],
max_tokens=20, max_tokens=20,
) )
print(f"response1: {response1}") print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = completion( response2 = completion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summertime",
} }
], ],
max_tokens=20, max_tokens=20,
) )
print(f"response2: {response1}") print(f"response2: {response2}")
assert response1.id == response2.id assert response1.id == response2.id
# test_redis_cache_completion() # test_redis_cache_completion()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_redis_semantic_cache_acompletion(): async def test_redis_semantic_cache_acompletion():
litellm.set_verbose = True litellm.set_verbose = True
@ -1752,38 +1741,32 @@ async def test_redis_semantic_cache_acompletion():
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
print("testing semantic caching") print("testing semantic caching")
litellm.cache = Cache( litellm.cache = Cache(
type="redis-semantic", type="redis-semantic",
host=os.environ["REDIS_HOST"], host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"], port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"], password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, similarity_threshold=0.7,
redis_semantic_cache_use_async=True,
) )
response1 = await litellm.acompletion( response1 = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summer",
} }
], ],
max_tokens=5, max_tokens=5,
) )
print(f"response1: {response1}") print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = await litellm.acompletion( response2 = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summertime",
} }
], ],
max_tokens=5, max_tokens=5,

View file

@ -0,0 +1,175 @@
{
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"trace_id": null,
"call_type": "acompletion",
"cache_hit": null,
"stream": true,
"status": "success",
"custom_llm_provider": "openai",
"saved_cache_cost": 0.0,
"startTime": "2025-01-24 09:20:46.847371",
"endTime": "2025-01-24 09:20:46.851954",
"completionStartTime": "2025-01-24 09:20:46.851954",
"response_time": 0.007394075393676758,
"model": "gpt-4o",
"metadata": {
"user_api_key_hash": null,
"user_api_key_alias": null,
"user_api_key_team_id": null,
"user_api_key_org_id": null,
"user_api_key_user_id": null,
"user_api_key_team_alias": null,
"user_api_key_user_email": null,
"spend_logs_metadata": null,
"requester_ip_address": null,
"requester_metadata": null,
"user_api_key_end_user_id": null,
"prompt_management_metadata": null,
"applied_guardrails": []
},
"cache_key": null,
"response_cost": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": [],
"end_user": "",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": null,
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"response": {
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"created": 1742855151,
"model": "gpt-4o",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "hi",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 20,
"prompt_tokens": 10,
"total_tokens": 30,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
},
"model_parameters": {},
"hidden_params": {
"model_id": null,
"cache_key": null,
"api_base": "https://api.openai.com",
"response_cost": 0.00022500000000000002,
"additional_headers": {},
"litellm_overhead_time_ms": null,
"batch_models": null,
"litellm_model_name": "gpt-4o"
},
"model_map_information": {
"model_map_key": "gpt-4o",
"model_map_value": {
"key": "gpt-4o",
"max_tokens": 16384,
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"input_cost_per_token": 2.5e-06,
"cache_creation_input_token_cost": null,
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"input_cost_per_query": null,
"input_cost_per_second": null,
"input_cost_per_audio_token": null,
"input_cost_per_token_batches": 1.25e-06,
"output_cost_per_token_batches": 5e-06,
"output_cost_per_token": 1e-05,
"output_cost_per_audio_token": null,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_cost_per_second": null,
"output_cost_per_image": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat",
"supports_system_messages": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"supports_assistant_prefill": false,
"supports_prompt_caching": true,
"supports_audio_input": false,
"supports_audio_output": false,
"supports_pdf_input": false,
"supports_embedding_image_input": false,
"supports_native_streaming": null,
"supports_web_search": true,
"search_context_cost_per_query": {
"search_context_size_low": 0.03,
"search_context_size_medium": 0.035,
"search_context_size_high": 0.05
},
"tpm": null,
"rpm": null,
"supported_openai_params": [
"frequency_penalty",
"logit_bias",
"logprobs",
"top_logprobs",
"max_tokens",
"max_completion_tokens",
"modalities",
"prediction",
"n",
"presence_penalty",
"seed",
"stop",
"stream",
"stream_options",
"temperature",
"top_p",
"tools",
"tool_choice",
"function_call",
"functions",
"max_retries",
"extra_headers",
"parallel_tool_calls",
"audio",
"response_format",
"user"
]
}
},
"error_str": null,
"error_information": {
"error_code": "",
"error_class": "",
"llm_provider": "",
"traceback": "",
"error_message": ""
},
"response_cost_failure_debug_info": null,
"guardrail_information": null,
"standard_built_in_tools_params": {
"web_search_options": null,
"file_search": null
}
}

View file

@ -0,0 +1,151 @@
import os
import sys
import traceback
import uuid
import pytest
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
load_dotenv()
import io
import os
import time
import json
# this file tests litellm/proxy
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
import asyncio
from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger):
def __init__(self):
self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print(
"standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str),
)
self.recorded_usage = Usage(
prompt_tokens=standard_logging_payload.get("prompt_tokens"),
completion_tokens=standard_logging_payload.get("completion_tokens"),
total_tokens=standard_logging_payload.get("total_tokens"),
)
pass
async def _setup_web_search_test():
"""Helper function to setup common test requirements"""
litellm._turn_on_debug()
test_custom_logger = TestCustomLogger()
litellm.callbacks = [test_custom_logger]
return test_custom_logger
async def _verify_web_search_cost(test_custom_logger, expected_context_size):
"""Helper function to verify web search costs"""
await asyncio.sleep(1)
standard_logging_payload = test_custom_logger.standard_logging_payload
response_cost = standard_logging_payload.get("response_cost")
assert response_cost is not None
# Calculate token cost
model_map_information = standard_logging_payload["model_map_information"]
model_map_value: ModelInfoBase = model_map_information["model_map_value"]
total_token_cost = (
standard_logging_payload["prompt_tokens"]
* model_map_value["input_cost_per_token"]
) + (
standard_logging_payload["completion_tokens"]
* model_map_value["output_cost_per_token"]
)
# Verify total cost
assert (
response_cost
== total_token_cost
+ model_map_value["search_context_cost_per_query"][expected_context_size]
)
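# Worked example of the assertion above, using the gpt-4o model-map values from the
# standard_logging_payload.json fixture in this commit (input 2.5e-06 and output
# 1e-05 per token, medium search context 0.035 per query): a call with 10 prompt
# and 20 completion tokens would cost
#     10 * 2.5e-06 + 20 * 1e-05 + 0.035 = 0.000225 + 0.035 = 0.035225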
@pytest.mark.asyncio
@pytest.mark.parametrize(
"web_search_options,expected_context_size",
[
(None, "search_context_size_medium"),
({"search_context_size": "low"}, "search_context_size_low"),
({"search_context_size": "high"}, "search_context_size_high"),
],
)
async def test_openai_web_search_logging_cost_tracking(
web_search_options, expected_context_size
):
"""Test web search cost tracking with different search context sizes"""
test_custom_logger = await _setup_web_search_test()
request_kwargs = {
"model": "openai/gpt-4o-search-preview",
"messages": [
{"role": "user", "content": "What was a positive news story from today?"}
],
}
if web_search_options is not None:
request_kwargs["web_search_options"] = web_search_options
response = await litellm.acompletion(**request_kwargs)
await _verify_web_search_cost(test_custom_logger, expected_context_size)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"tools_config,expected_context_size,stream",
[
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
True,
),
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
False,
),
([{"type": "web_search_preview"}], "search_context_size_medium", True),
([{"type": "web_search_preview"}], "search_context_size_medium", False),
],
)
async def test_openai_responses_api_web_search_cost_tracking(
tools_config, expected_context_size, stream
):
"""Test web search cost tracking with different search context sizes and streaming options"""
test_custom_logger = await _setup_web_search_test()
response = await litellm.aresponses(
model="openai/gpt-4o",
input=[
{"role": "user", "content": "What was a positive news story from today?"}
],
tools=tools_config,
stream=stream,
)
if stream is True:
async for chunk in response:
print("chunk", chunk)
else:
print("response", response)
await _verify_web_search_cost(test_custom_logger, expected_context_size)

View file

@ -6,6 +6,7 @@ import sys
sys.path.insert(0, os.path.abspath("../..")) sys.path.insert(0, os.path.abspath("../.."))
import asyncio import asyncio
import litellm
import gzip import gzip
import json import json
import logging import logging
@ -48,8 +49,15 @@ def assert_gcs_pubsub_request_matches_expected(
expected_request_body = json.load(f) expected_request_body = json.load(f)
# Replace dynamic values in actual request body # Replace dynamic values in actual request body
time_fields = ["startTime", "endTime", "completionStartTime", "request_id"] dynamic_fields = [
for field in time_fields: "startTime",
"endTime",
"completionStartTime",
"request_id",
"id",
"response_time",
]
for field in dynamic_fields:
if field in actual_request_body: if field in actual_request_body:
actual_request_body[field] = expected_request_body[field] actual_request_body[field] = expected_request_body[field]
@ -59,6 +67,55 @@ def assert_gcs_pubsub_request_matches_expected(
), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}" ), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"
def assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request_body: dict,
expected_file_name: str,
):
"""
Helper function to compare actual GCS PubSub request body with expected JSON file.
Args:
actual_request_body (dict): The actual request body received from the API call
expected_file_name (str): Name of the JSON file containing expected request body
"""
# Get the current directory and read the expected request body
pwd = os.path.dirname(os.path.realpath(__file__))
expected_body_path = os.path.join(pwd, "gcs_pub_sub_body", expected_file_name)
with open(expected_body_path, "r") as f:
expected_request_body = json.load(f)
# Normalize the dynamic response values below, then validate that the expected fields are present
FIELDS_TO_VALIDATE = [
"custom_llm_provider",
"hidden_params",
"messages",
"response",
"model",
"status",
"stream",
]
actual_request_body["response"]["id"] = expected_request_body["response"]["id"]
actual_request_body["response"]["created"] = expected_request_body["response"][
"created"
]
for field in FIELDS_TO_VALIDATE:
assert field in actual_request_body
FIELDS_EXISTENCE_CHECKS = [
"response_cost",
"response_time",
"completion_tokens",
"prompt_tokens",
"total_tokens",
]
for field in FIELDS_EXISTENCE_CHECKS:
assert field in actual_request_body
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_gcs_pub_sub(): async def test_async_gcs_pub_sub():
# Create a mock for the async_httpx_client's post method # Create a mock for the async_httpx_client's post method
@ -102,6 +159,61 @@ async def test_async_gcs_pub_sub():
decoded_message = base64.b64decode(encoded_message).decode("utf-8") decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message)
print("##########\n")
print(json.dumps(actual_request, indent=4))
print("##########\n")
# Verify the request body matches expected format
assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request, "standard_logging_payload.json"
)
@pytest.mark.asyncio
async def test_async_gcs_pub_sub_v1():
# Create a mock for the async_httpx_client's post method
litellm.gcs_pub_sub_use_v1 = True
mock_post = AsyncMock()
mock_post.return_value.status_code = 202
mock_post.return_value.text = "Accepted"
# Initialize the GcsPubSubLogger and set the mock
gcs_pub_sub_logger = GcsPubSubLogger(flush_interval=1)
gcs_pub_sub_logger.async_httpx_client.post = mock_post
mock_construct_request_headers = AsyncMock()
mock_construct_request_headers.return_value = {"Authorization": "Bearer mock_token"}
gcs_pub_sub_logger.construct_request_headers = mock_construct_request_headers
litellm.callbacks = [gcs_pub_sub_logger]
# Make the completion call
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="hi",
)
await asyncio.sleep(3) # Wait for async flush
# Assert httpx post was called
mock_post.assert_called_once()
# Get the actual request body from the mock
actual_url = mock_post.call_args[1]["url"]
print("sent to url", actual_url)
assert (
actual_url
== "https://pubsub.googleapis.com/v1/projects/reliableKeys/topics/litellmDB:publish"
)
actual_request = mock_post.call_args[1]["json"]
# Extract and decode the base64 encoded message
encoded_message = actual_request["messages"][0]["data"]
import base64
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary # Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message) actual_request = json.loads(decoded_message)
print("##########\n") print("##########\n")

View file

@ -21,16 +21,18 @@ sys.path.insert(
import litellm import litellm
import asyncio import asyncio
from typing import Optional from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger): class TestCustomLogger(CustomLogger):
def __init__(self): def __init__(self):
self.recorded_usage: Optional[Usage] = None self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object") standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print( print(
"standard_logging_payload", "standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str), json.dumps(standard_logging_payload, indent=4, default=str),