Merge branch 'main' into litellm_exp_mcp_server

Ishaan Jaff 2025-03-24 19:03:56 -07:00
commit 08a4ba1b7e
58 changed files with 2991 additions and 627 deletions

View file

@ -1855,7 +1855,7 @@ jobs:
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \
-e STORE_MODEL_IN_DB="True" \
-e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \

View file

@ -4,7 +4,8 @@ python-dotenv
tiktoken
importlib_metadata
cohere
redis
redis==5.2.1
redisvl==0.4.1
anthropic
orjson==3.9.15
pydantic==2.10.2

.gitignore (vendored): 1 change
View file

@ -1,3 +1,4 @@
.python-version
.venv
.env
.newenv

View file

@ -37,9 +37,6 @@ RUN pip install dist/*.whl
# install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y

View file

@ -1,35 +1,5 @@
version: "3.11"
services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-stable
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
# command:
# - "--config=/app/config.yaml"
##############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db:
image: postgres:16
restart: always
@ -46,25 +16,3 @@ services:
interval: 1s
timeout: 5s
retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
# ...rest of your docker-compose config if any

View file

@ -59,9 +59,6 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y

View file

@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
apt-get install -y gcc g++ python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip && \
@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
# ensure pyjwt is used, not jwt
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \
pip uninstall jwt -y && \
RUN pip uninstall jwt -y && \
pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir

View file

@ -26,7 +26,7 @@ Install redis
pip install redis
```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
For the hosted version you can set up your own Redis DB here: https://redis.io/try-free/
```python
import litellm
@ -37,11 +37,11 @@ litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password
# Make completion calls
response1 = completion(
model="gpt-3.5-turbo",
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
)
response2 = completion(
model="gpt-3.5-turbo",
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
)
@ -91,12 +91,12 @@ response2 = completion(
<TabItem value="redis-sem" label="redis-semantic cache">
Install redis
Install redisvl client
```shell
pip install redisvl==0.0.7
pip install redisvl==0.4.1
```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
For the hosted version you can set up your own Redis DB here: https://redis.io/try-free/
```python
import litellm
@ -114,6 +114,7 @@ litellm.cache = Cache(
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
ttl=120,
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
)
response1 = completion(
@ -471,11 +472,13 @@ def __init__(
password: Optional[str] = None,
namespace: Optional[str] = None,
default_in_redis_ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None,
# redis semantic cache params
similarity_threshold: Optional[float] = None,
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
# s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None,

View file

@ -200,3 +200,92 @@ Expected Response
</TabItem>
</Tabs>
## OpenAI 'file' message type
This is currently only supported for OpenAI models.
This will be supported for all providers soon.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,308 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using Web Search
Use web search with litellm
| Feature | Details |
|---------|---------|
| Supported Endpoints | - `/chat/completions` <br/> - `/responses` |
| Supported Providers | `openai` |
| LiteLLM Cost Tracking | ✅ Supported |
| LiteLLM Version | `v1.63.15-nightly` or higher |
## `/chat/completions` (litellm.completion)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
]
)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
# Customize search context size
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
</Tabs>
## `/responses` (litellm.responses)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview" # enables web search with default medium context size
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
# Customize search context size
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
## Checking if a model supports web search
<Tabs>
<TabItem label="SDK" value="sdk">
Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches
```python showLineNumbers
assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define OpenAI models in config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
model_info:
supports_web_search: True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Call `/model_group/info` to check if a model supports web search
```shell
curl -X 'GET' \
'http://localhost:4000/model_group/info' \
-H 'accept: application/json' \
-H 'x-api-key: sk-1234'
```
Expected Response
```json showLineNumbers
{
"data": [
{
"model_group": "gpt-4o-search-preview",
"providers": ["openai"],
"max_tokens": 128000,
"supports_web_search": true, # 👈 supports_web_search is true
}
]
}
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,66 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# SSL Security Settings
If you're in an environment using an older TLS bundle with older encryption, follow this guide.
LiteLLM uses HTTPX for network requests, unless otherwise specified.
1. Disable SSL verification
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_verify = False
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_verify: false
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_VERIFY="False"
```
</TabItem>
</Tabs>
2. Lower security settings
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_security_level = 1
litellm.ssl_certificate = "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_security_level: 1
ssl_certificate: "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_SECURITY_LEVEL="1"
export SSL_CERTIFICATE="/path/to/certificate.pem"
```
</TabItem>
</Tabs>
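For intuition, here is a minimal sketch of what a lowered security level and a custom CA bundle correspond to at the HTTPX/`ssl` layer. This is not LiteLLM's internal implementation; the endpoint URL and certificate path are placeholders.
```python
import ssl
import httpx

# Build a context that trusts a custom CA bundle and accepts
# older ciphers / key sizes (OpenSSL security level 1).
ctx = ssl.create_default_context(cafile="/path/to/certificate.pem")
ctx.set_ciphers("DEFAULT@SECLEVEL=1")

# Hand the context to an httpx client, since LiteLLM's requests go through httpx.
client = httpx.Client(verify=ctx)
response = client.get("https://legacy-endpoint.example.com/health")  # placeholder URL
print(response.status_code)
```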

View file

@ -1,4 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Arize AI
@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm
:::
<Image img={require('../../img/arize.png')} />
## Pre-Requisites
@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
@ -48,7 +55,7 @@ response = litellm.completion(
### Using with LiteLLM Proxy
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
@ -60,13 +67,134 @@ model_list:
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc)
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}'
```
## Pass Arize Space/Key per-request
Supported parameters:
- `arize_api_key`
- `arize_space_key`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"),
arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"),
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem value="curl" label="CURL">
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}],
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}
)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -291,14 +291,15 @@ response = completion(
)
```
## Azure O1 Models
## O-Series Models
| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| o1-mini | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| o1-preview | `response = completion(model="azure/<your deployment name>", messages=messages)` |
Azure OpenAI O-Series models are supported on LiteLLM.
Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
@ -306,60 +307,112 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami
```python
import litellm
litellm.enable_preview_features = True # 👈 KEY CHANGE
response = litellm.completion(
model="azure/<your deployment name>",
messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
stream=True
)
for chunk in response:
print(chunk)
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="Proxy">
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: o1-mini
- model_name: o3-mini
litellm_params:
model: azure/o1-mini
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "os.environ/AZURE_API_VERSION"
litellm_settings:
enable_preview_features: true # 👈 KEY CHANGE
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
2. Start proxy
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
## Azure Audio Model
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
response = completion(
model="azure/azure-openai-4o-audio",
messages=[
{
"role": "user",
"content": "I want to try out speech to speech"
}
],
modalities=["text","audio"],
audio={"voice": "alloy", "format": "wav"}
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: azure-openai-4o-audio
litellm_params:
model: azure/azure-openai-4o-audio
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it
3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="o1-mini", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
stream=True)
for chunk in response:
print(chunk)
```
```bash
curl http://localhost:4000/v1/chat/completions \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "azure-openai-4o-audio",
"messages": [{"role": "user", "content": "I want to try out speech to speech"}],
"modalities": ["text","audio"],
"audio": {"voice": "alloy", "format": "wav"}
}'
```
</TabItem>
</Tabs>
@ -948,62 +1001,9 @@ Expected Response:
{"data":[{"id":"batch_R3V...}
```
## O-Series Models
Azure OpenAI O-Series models are supported on LiteLLM.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>

View file

@ -1428,10 +1428,14 @@ response = litellm.embedding(
## Supported AWS Bedrock Models
LiteLLM supports ALL Bedrock models.
Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
| Model Name | Command | Required Environment Variables |
|----------------------------|------------------------------------------------------------------|--------------------------------|
| Deepseek R1 | `completion(model='bedrock/us.deepseek.r1-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
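For reference, a minimal sketch of calling one of the models in the table above with the LiteLLM SDK. The credential and region values are placeholders; any entry from the model cost map can be substituted for the model name.
```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""       # placeholder
os.environ["AWS_SECRET_ACCESS_KEY"] = ""   # placeholder
os.environ["AWS_REGION_NAME"] = "us-east-1"  # placeholder region

response = completion(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",  # from the table above
    messages=[{"role": "user", "content": "Hello from Bedrock!"}],
)
print(response.choices[0].message.content)
```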

View file

@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Using Ollama FIM on `/v1/completions`
LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm._turn_on_debug() # turn on debug to see the request
from litellm import completion
response = completion(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama/llama3.1"
api_base: "http://localhost:11434"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml --detailed_debug
# RUNNING ON http://0.0.0.0:4000
```
3. Test it!
```python
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
response = client.completions.create(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
</Tabs>
## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`
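As a quick illustration (a sketch, assuming a local Ollama server with the `llama3.1` model pulled), the only change from the examples above is the `ollama_chat/` prefix:
```python
from litellm import completion

response = completion(
    model="ollama_chat/llama3.1",   # routes to POST /api/chat on the ollama server
    messages=[{"role": "user", "content": "Hello, world!"}],
    api_base="http://localhost:11434",  # your ollama server
)
print(response.choices[0].message.content)
```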

View file

@ -228,6 +228,92 @@ response = completion(
```
## PDF File Parsing
OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>
## OpenAI Fine Tuned Models
| Model Name | Function Call |
@ -449,26 +535,6 @@ response = litellm.acompletion(
)
```
### Using Helicone Proxy with LiteLLM
```python
import os
import litellm
from litellm import completion
os.environ["OPENAI_API_KEY"] = ""
# os.environ["OPENAI_API_BASE"] = ""
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
"Helicone-Cache-Enabled": "true",
}
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion("gpt-3.5-turbo", messages)
```
### Using OpenAI Proxy with LiteLLM
```python

View file

@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o
import os
from litellm import completion
os.environ["OPENROUTER_API_KEY"] = ""
os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1
os.environ["OR_SITE_URL"] = "" # optional
os.environ["OR_APP_NAME"] = "" # optional
os.environ["OR_SITE_URL"] = "" # [OPTIONAL]
os.environ["OR_APP_NAME"] = "" # [OPTIONAL]
response = completion(
model="openrouter/google/palm-2-chat-bison",

View file

@ -70,6 +70,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
response: str,
):
pass
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for plugins that need to see the entire stream.
"""
async for item in response:
yield item
proxy_handler_instance = MyCustomHandler()
```

View file

@ -147,6 +147,7 @@ general_settings:
|------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB, including key/user/team spend updates. |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
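For illustration, a minimal sketch of how a few of these flags sit under `general_settings` in `config.yaml` (values are examples only):
```yaml
general_settings:
  master_key: sk-1234               # example only
  completion_model: gpt-4o          # default model when `model` is omitted
  disable_spend_logs: true          # don't write per-transaction spend logs
  disable_spend_updates: true       # don't write any key/user/team spend updates
  disable_reset_budget: true        # turn off the budget reset scheduled task
```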

View file

@ -10,10 +10,12 @@ Use this if you want to write code to run a custom guardrail
### 1. Write a `CustomGuardrail` Class
A CustomGuardrail has 3 methods to enforce guardrails
A CustomGuardrail has 4 methods to enforce guardrails
- `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call
- `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency)
- `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call
- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail
**[See detailed spec of methods here](#customguardrail-methods)**
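For orientation, here is a bare skeleton with all four hooks. Signatures follow the spec linked above but are simplified (e.g. `call_type` typed as `str`); double-check them against your LiteLLM version before relying on this sketch.
```python
from typing import Any, AsyncGenerator, Optional, Union

from litellm.caching.caching import DualCache
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.utils import ModelResponseStream


class MySkeletonGuardrail(CustomGuardrail):
    async def async_pre_call_hook(
        self, user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, call_type: str
    ) -> Optional[Union[Exception, str, dict]]:
        # modify `data`, or raise an exception to reject the request before the LLM call
        return data

    async def async_moderation_hook(
        self, data: dict, user_api_key_dict: UserAPIKeyAuth, call_type: str
    ):
        # runs while the LLM call is in flight; raise to reject
        return None

    async def async_post_call_success_hook(
        self, data: dict, user_api_key_dict: UserAPIKeyAuth, response
    ):
        # inspect the finished response; raise to block it
        return response

    async def async_post_call_streaming_iterator_hook(
        self, user_api_key_dict: UserAPIKeyAuth, response: Any, request_data: dict
    ) -> AsyncGenerator[ModelResponseStream, None]:
        # see the whole stream before it reaches the client
        async for item in response:
            yield item
```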
@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail):
):
raise ValueError("Guardrail failed Coffee Detected")
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for guardrails that need to see the entire response, such as PII masking.
See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168
Triggered by mode: 'post_call'
"""
async for item in response:
yield item
```

View file

@ -79,6 +79,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
| `litellm_model_name` | `Optional[str]` | Model name sent in request |
## StandardLoggingModelInformation

View file

@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
## Cost Tracking Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call |
| `x-litellm-key-spend` | float | Total spend for the API key |
| Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call | |
| `x-litellm-key-spend` | float | Total spend for the API key | ✅ |
## LiteLLM Specific Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call |
| `x-litellm-model-id` | string | Unique identifier for the model used |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint |
| `x-litellm-version` | string | Version of LiteLLM being used |
| `x-litellm-model-group` | string | Model group identifier |
| Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call | ✅ |
| `x-litellm-model-id` | string | Unique identifier for the model used | |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ |
| `x-litellm-version` | string | Version of LiteLLM being used | |
| `x-litellm-model-group` | string | Model group identifier | |
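To see these headers in practice, you can dump the response headers of any proxy call. This is a sketch; the proxy URL, key, and model name are placeholders for your own deployment.
```bash
curl -sD - -o /dev/null 'http://0.0.0.0:4000/chat/completions' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer sk-1234' \
  -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]}' \
  | grep -i '^x-litellm'
```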
## Response headers from LLM providers

Binary image file added (707 KiB), not shown.

View file

@ -26,14 +26,6 @@ This release is primarily focused on:
- UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model
:::info
This release will be live on 03/16/2025
:::
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
## Known Issues
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test

View file

@ -0,0 +1,130 @@
---
title: v1.63.14-stable
slug: v1.63.14-stable
date: 2025-03-22T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.63.11-stable`.
This release brings:
- LLM Translation Improvements (MCP Support and Bedrock Application Profiles)
- Perf improvements for Usage-based Routing
- Streaming guardrail support via websockets
## Docker Run LiteLLM Proxy
```
docker run \
-e STORE_MODEL_IN_DB=True \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-v1.63.14-stable
```
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
- Azure gpt-4o - fixed pricing to latest global pricing - [PR](https://github.com/BerriAI/litellm/pull/9361)
- O1-Pro - add pricing + model information - [PR](https://github.com/BerriAI/litellm/pull/9397)
- Azure AI - mistral 3.1 small pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
- Azure - gpt-4.5-preview pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
## LLM Translation
1. **New LLM Features**
- Bedrock: Support bedrock application inference profiles [Docs](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile)
- Infer aws region from bedrock application profile id - (`arn:aws:bedrock:us-east-1:...`)
- Ollama - support calling via `/v1/completions` [Get Started](../../docs/providers/ollama#using-ollama-fim-on-v1completions)
- Bedrock - support `us.deepseek.r1-v1:0` model name [Docs](../../docs/providers/bedrock#supported-aws-bedrock-models)
- OpenRouter - `OPENROUTER_API_BASE` env var support [Docs](../../docs/providers/openrouter.md)
- Azure - add audio model parameter support - [Docs](../../docs/providers/azure#azure-audio-model)
- OpenAI - PDF File support [Docs](../../docs/completion/document_understanding#openai-file-message-type)
- OpenAI - o1-pro Responses API streaming support [Docs](../../docs/response_api.md#streaming)
- [BETA] MCP - Use MCP Tools with LiteLLM SDK [Docs](../../docs/mcp)
2. **Bug Fixes**
- Voyage: prompt token on embedding tracking fix - [PR](https://github.com/BerriAI/litellm/commit/56d3e75b330c3c3862dc6e1c51c1210e48f1068e)
- Sagemaker - Fix Too little data for declared Content-Length error - [PR](https://github.com/BerriAI/litellm/pull/9326)
- OpenAI-compatible models - fix issue when calling openai-compatible models w/ custom_llm_provider set - [PR](https://github.com/BerriAI/litellm/pull/9355)
- VertexAI - Embedding outputDimensionality support - [PR](https://github.com/BerriAI/litellm/commit/437dbe724620675295f298164a076cbd8019d304)
- Anthropic - return consistent json response format on streaming/non-streaming - [PR](https://github.com/BerriAI/litellm/pull/9437)
## Spend Tracking Improvements
- `litellm_proxy/` - support reading litellm response cost header from proxy, when using client sdk
- Reset Budget Job - fix budget reset error on keys/teams/users [PR](https://github.com/BerriAI/litellm/pull/9329)
- Streaming - Prevents final chunk w/ usage from being ignored (impacted bedrock streaming + cost tracking) [PR](https://github.com/BerriAI/litellm/pull/9314)
## UI
1. Users Page
- Feature: Control default internal user settings [PR](https://github.com/BerriAI/litellm/pull/9328)
2. Icons:
- Feature: Replace external "artificialanalysis.ai" icons by local svg [PR](https://github.com/BerriAI/litellm/pull/9374)
3. Sign In/Sign Out
- Fix: Default login when `default_user_id` user does not exist in DB [PR](https://github.com/BerriAI/litellm/pull/9395)
## Logging Integrations
- Support post-call guardrails for streaming responses [Get Started](../../docs/proxy/guardrails/custom_guardrail#1-write-a-customguardrail-class)
- Arize [Get Started](../../docs/observability/arize_integration)
- fix invalid package import [PR](https://github.com/BerriAI/litellm/pull/9338)
- migrate to using standardloggingpayload for metadata, ensures spans land successfully [PR](https://github.com/BerriAI/litellm/pull/9338)
- fix logging to just log the LLM I/O [PR](https://github.com/BerriAI/litellm/pull/9353)
- Dynamic API Key/Space param support [Get Started](../../docs/observability/arize_integration#pass-arize-spacekey-per-request)
- StandardLoggingPayload - Log litellm_model_name in payload. Allows knowing what the model sent to API provider was [Get Started](../../docs/proxy/logging_spec#standardlogginghiddenparams)
- Prompt Management - Allow building custom prompt management integration [Get Started](../../docs/proxy/custom_prompt_management.md)
## Performance / Reliability improvements
- Redis Caching - add 5s default timeout, prevents hanging redis connection from impacting llm calls [PR](https://github.com/BerriAI/litellm/commit/db92956ae33ed4c4e3233d7e1b0c7229817159bf)
- Allow disabling all spend updates / writes to DB - patch to allow disabling all spend updates to DB with a flag [PR](https://github.com/BerriAI/litellm/pull/9331)
- Azure OpenAI - correctly re-use azure openai client, fixes perf issue from previous Stable release [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Azure OpenAI - uses litellm.ssl_verify on Azure/OpenAI clients [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Usage-based routing - Wildcard model support [Get Started](../../docs/proxy/usage_based_routing#wildcard-model-support)
- Usage-based routing - Support batch writing increments to redis - reduces latency to same as simple-shuffle [PR](https://github.com/BerriAI/litellm/pull/9357)
- Router - show reason for model cooldown on no healthy deployments available error [PR](https://github.com/BerriAI/litellm/pull/9438)
- Caching - add max value limit to an item in in-memory cache (1MB) - prevents OOM errors on large image urls being sent through proxy [PR](https://github.com/BerriAI/litellm/pull/9448)
## General Improvements
- Passthrough Endpoints - support returning api-base on pass-through endpoints Response Headers [Docs](../../docs/proxy/response_headers#litellm-specific-headers)
- SSL - support reading ssl security level from env var - Allows user to specify lower security settings [Get Started](../../docs/guides/security_settings)
- Credentials - only poll Credentials table when `STORE_MODEL_IN_DB` is True [PR](https://github.com/BerriAI/litellm/pull/9376)
- Image URL Handling - new architecture doc on image url handling [Docs](../../docs/proxy/image_handling)
- OpenAI - bump to pip install "openai==1.68.2" [PR](https://github.com/BerriAI/litellm/commit/e85e3bc52a9de86ad85c3dbb12d87664ee567a5a)
- Gunicorn - security fix - bump gunicorn==23.0.0 [PR](https://github.com/BerriAI/litellm/commit/7e9fc92f5c7fea1e7294171cd3859d55384166eb)
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.11-stable...v1.63.14.rc)

View file

@ -243,7 +243,9 @@ const sidebars = {
"exception_mapping",
"completion/provider_specific_params",
"guides/finetuned_models",
"guides/security_settings",
"completion/audio",
"completion/web_search",
"completion/document_understanding",
"completion/vision",
"completion/json_mode",

View file

@ -122,6 +122,9 @@ langsmith_batch_size: Optional[int] = None
prometheus_initialize_budget_metrics: Optional[bool] = False
argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = (
False # if you want to use v1 gcs pubsub logged payload
)
argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = (
[]
@ -756,6 +759,7 @@ from .utils import (
create_pretrained_tokenizer,
create_tokenizer,
supports_function_calling,
supports_web_search,
supports_response_schema,
supports_parallel_function_calling,
supports_vision,

View file

@ -88,16 +88,16 @@ class Cache:
s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None,
s3_path: Optional[str] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
redis_flush_size: Optional[int] = None,
redis_startup_nodes: Optional[List] = None,
disk_cache_dir=None,
disk_cache_dir: Optional[str] = None,
qdrant_api_base: Optional[str] = None,
qdrant_api_key: Optional[str] = None,
qdrant_collection_name: Optional[str] = None,
qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model="text-embedding-ada-002",
qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
**kwargs,
):
"""
@ -170,8 +170,8 @@ class Cache:
port=port,
password=password,
similarity_threshold=similarity_threshold,
use_async=redis_semantic_cache_use_async,
embedding_model=redis_semantic_cache_embedding_model,
index_name=redis_semantic_cache_index_name,
**kwargs,
)
elif type == LiteLLMCacheType.QDRANT_SEMANTIC:

View file

@ -1,337 +1,437 @@
"""
Redis Semantic Cache implementation
Redis Semantic Cache implementation for LiteLLM
Has 4 methods:
- set_cache
- get_cache
- async_set_cache
- async_get_cache
The RedisSemanticCache provides semantic caching functionality using Redis as a backend.
This cache stores responses based on the semantic similarity of prompts rather than
exact matching, allowing for more flexible caching of LLM responses.
This implementation uses RedisVL's SemanticCache to find semantically similar prompts
and their cached responses.
"""
import ast
import asyncio
import json
from typing import Any
import os
from typing import Any, Dict, List, Optional, Tuple
import litellm
from litellm._logging import print_verbose
from litellm.litellm_core_utils.prompt_templates.common_utils import get_str_from_messages
from .base_cache import BaseCache
class RedisSemanticCache(BaseCache):
"""
Redis-backed semantic cache for LLM responses.
This cache uses vector similarity to find semantically similar prompts that have been
previously sent to the LLM, allowing for cache hits even when prompts are not identical
but carry similar meaning.
"""
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
def __init__(
self,
host=None,
port=None,
password=None,
redis_url=None,
similarity_threshold=None,
use_async=False,
embedding_model="text-embedding-ada-002",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
redis_url: Optional[str] = None,
similarity_threshold: Optional[float] = None,
embedding_model: str = "text-embedding-ada-002",
index_name: Optional[str] = None,
**kwargs,
):
from redisvl.index import SearchIndex
print_verbose(
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
)
if similarity_threshold is None:
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
schema = {
"index": {
"name": "litellm_semantic_cache_index",
"prefix": "litellm",
"storage_type": "hash",
},
"fields": {
"text": [{"name": "response"}],
"vector": [
{
"name": "litellm_embedding",
"dims": 1536,
"distance_metric": "cosine",
"algorithm": "flat",
"datatype": "float32",
}
],
},
}
if redis_url is None:
# if no url passed, check if host, port and password are passed, if not raise an Exception
if host is None or port is None or password is None:
# try checking env for host, port and password
import os
host = os.getenv("REDIS_HOST")
port = os.getenv("REDIS_PORT")
password = os.getenv("REDIS_PASSWORD")
if host is None or port is None or password is None:
raise Exception("Redis host, port, and password must be provided")
redis_url = "redis://:" + password + "@" + host + ":" + port
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
if use_async is False:
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url)
try:
self.index.create(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
elif use_async is True:
schema["index"]["name"] = "litellm_semantic_cache_index_async"
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url, use_async=True)
#
def _get_cache_logic(self, cached_response: Any):
"""
Common 'get_cache_logic' across sync + async redis client implementations
Initialize the Redis Semantic Cache.
Args:
host: Redis host address
port: Redis port
password: Redis password
redis_url: Full Redis URL (alternative to separate host/port/password)
similarity_threshold: Threshold for semantic similarity (0.0 to 1.0)
where 1.0 requires exact matches and 0.0 accepts any match
embedding_model: Model to use for generating embeddings
index_name: Name for the Redis index
ttl: Default time-to-live for cache entries in seconds
**kwargs: Additional arguments passed to the Redis client
Raises:
Exception: If similarity_threshold is not provided or required Redis
connection information is missing
"""
from redisvl.extensions.llmcache import SemanticCache
from redisvl.utils.vectorize import CustomTextVectorizer
if index_name is None:
index_name = self.DEFAULT_REDIS_INDEX_NAME
print_verbose(f"Redis semantic-cache initializing index - {index_name}")
# Validate similarity threshold
if similarity_threshold is None:
raise ValueError("similarity_threshold must be provided, passed None")
# Store configuration
self.similarity_threshold = similarity_threshold
# Convert similarity threshold [0,1] to distance threshold [0,2]
# For cosine distance: 0 = most similar, 2 = least similar
# While similarity: 1 = most similar, 0 = least similar
self.distance_threshold = 1 - similarity_threshold
self.embedding_model = embedding_model
# Set up Redis connection
if redis_url is None:
try:
# Attempt to use provided parameters or fallback to environment variables
host = host or os.environ['REDIS_HOST']
port = port or os.environ['REDIS_PORT']
password = password or os.environ['REDIS_PASSWORD']
except KeyError as e:
# Raise a more informative exception if any of the required keys are missing
missing_var = e.args[0]
raise ValueError(f"Missing required Redis configuration: {missing_var}. "
f"Provide {missing_var} or redis_url.") from e
redis_url = f"redis://:{password}@{host}:{port}"
print_verbose(f"Redis semantic-cache redis_url: {redis_url}")
# Initialize the Redis vectorizer and cache
cache_vectorizer = CustomTextVectorizer(self._get_embedding)
self.llmcache = SemanticCache(
name=index_name,
redis_url=redis_url,
vectorizer=cache_vectorizer,
distance_threshold=self.distance_threshold,
overwrite=False,
)
def _get_ttl(self, **kwargs) -> Optional[int]:
"""
Get the TTL (time-to-live) value for cache entries.
Args:
**kwargs: Keyword arguments that may contain a custom TTL
Returns:
Optional[int]: The TTL value in seconds, or None if no TTL should be applied
"""
ttl = kwargs.get("ttl")
if ttl is not None:
ttl = int(ttl)
return ttl
def _get_embedding(self, prompt: str) -> List[float]:
"""
Generate an embedding vector for the given prompt using the configured embedding model.
Args:
prompt: The text to generate an embedding for
Returns:
List[float]: The embedding vector
"""
# Create an embedding from prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
embedding = embedding_response["data"][0]["embedding"]
return embedding
def _get_cache_logic(self, cached_response: Any) -> Any:
"""
Process the cached response to prepare it for use.
Args:
cached_response: The raw cached response
Returns:
The processed cache response, or None if input was None
"""
if cached_response is None:
return cached_response
# check if cached_response is bytes
# Convert bytes to string if needed
if isinstance(cached_response, bytes):
cached_response = cached_response.decode("utf-8")
# Convert string representation to Python object
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except Exception:
cached_response = ast.literal_eval(cached_response)
cached_response = json.loads(cached_response)
except json.JSONDecodeError:
try:
cached_response = ast.literal_eval(cached_response)
except (ValueError, SyntaxError) as e:
print_verbose(f"Error parsing cached response: {str(e)}")
return None
return cached_response
def set_cache(self, key, value, **kwargs):
import numpy as np
print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}")
# get the prompt
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
# create an embedding for prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
self.index.load(new_data)
return
def get_cache(self, key, **kwargs):
print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
# query
# get the messages
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
# convert to embedding
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
num_results=1,
)
results = self.index.query(query)
if results is None:
return None
if isinstance(results, list):
if len(results) == 0:
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
async def async_set_cache(self, key, value, **kwargs):
import numpy as np
from litellm.proxy.proxy_server import llm_model_list, llm_router
def set_cache(self, key: str, value: Any, **kwargs) -> None:
"""
Store a value in the semantic cache.
Args:
key: The cache key (not directly used in semantic caching)
value: The response value to cache
**kwargs: Additional arguments including 'messages' for the prompt
and optional 'ttl' for time-to-live
"""
print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")
try:
await self.index.acreate(overwrite=False) # don't overwrite existing index
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic caching")
return
prompt = get_str_from_messages(messages)
value_str = str(value)
# Get TTL and store in Redis semantic cache
ttl = self._get_ttl(**kwargs)
if ttl is not None:
self.llmcache.store(prompt, value_str, ttl=int(ttl))
else:
self.llmcache.store(prompt, value_str)
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}")
print_verbose(f"Error setting {value_str} in the Redis semantic cache: {str(e)}")
# get the prompt
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
# create an embedding for prompt
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
def get_cache(self, key: str, **kwargs) -> Any:
"""
Retrieve a semantically similar cached response.
Args:
key: The cache key (not directly used in semantic caching)
**kwargs: Additional arguments including 'messages' for the prompt
Returns:
The cached response if a semantically similar prompt is found, else None
"""
print_verbose(f"Redis semantic-cache get_cache, kwargs: {kwargs}")
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic cache lookup")
return None
prompt = get_str_from_messages(messages)
# Check the cache for semantically similar prompts
results = self.llmcache.check(prompt=prompt)
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
await self.index.aload(new_data)
return
async def async_get_cache(self, key, **kwargs):
print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
from litellm.proxy.proxy_server import llm_model_list, llm_router
# query
# get the messages
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
)
results = await self.index.aquery(query)
if results is None:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
if isinstance(results, list):
if len(results) == 0:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
# Return None if no similar prompts found
if not results:
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# Process the best matching result
cache_hit = results[0]
vector_distance = float(cache_hit["vector_distance"])
# Convert vector distance back to similarity score
# For cosine distance: 0 = most similar, 2 = least similar
# While similarity: 1 = most similar, 0 = least similar
similarity = 1 - vector_distance
cached_prompt = cache_hit["prompt"]
cached_response = cache_hit["response"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
f"Cache hit: similarity threshold: {self.similarity_threshold}, "
f"actual similarity: {similarity}, "
f"current prompt: {prompt}, "
f"cached prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
return self._get_cache_logic(cached_response=cached_response)
except Exception as e:
print_verbose(f"Error retrieving from Redis semantic cache: {str(e)}")
async def _get_async_embedding(self, prompt: str, **kwargs) -> List[float]:
"""
Asynchronously generate an embedding for the given prompt.
Args:
prompt: The text to generate an embedding for
**kwargs: Additional arguments that may contain metadata
Returns:
List[float]: The embedding vector
"""
from litellm.proxy.proxy_server import llm_model_list, llm_router
async def _index_info(self):
return await self.index.ainfo()
# Route the embedding request through the proxy if appropriate
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
try:
if llm_router is not None and self.embedding_model in router_model_names:
# Use the router for embedding generation
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# Generate embedding directly
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
async def async_set_cache_pipeline(self, cache_list, **kwargs):
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)
# Extract and return the embedding vector
return embedding_response["data"][0]["embedding"]
except Exception as e:
print_verbose(f"Error generating async embedding: {str(e)}")
raise ValueError(f"Failed to generate embedding: {str(e)}") from e
async def async_set_cache(self, key: str, value: Any, **kwargs) -> None:
"""
Asynchronously store a value in the semantic cache.
Args:
key: The cache key (not directly used in semantic caching)
value: The response value to cache
**kwargs: Additional arguments including 'messages' for the prompt
and optional 'ttl' for time-to-live
"""
print_verbose(f"Async Redis semantic-cache set_cache, kwargs: {kwargs}")
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic caching")
return
prompt = get_str_from_messages(messages)
value_str = str(value)
# Generate embedding for the value (response) to cache
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
# Get TTL and store in Redis semantic cache
ttl = self._get_ttl(**kwargs)
if ttl is not None:
await self.llmcache.astore(
prompt,
value_str,
vector=prompt_embedding, # Pass through custom embedding
ttl=ttl
)
else:
await self.llmcache.astore(
prompt,
value_str,
vector=prompt_embedding # Pass through custom embedding
)
except Exception as e:
print_verbose(f"Error in async_set_cache: {str(e)}")
async def async_get_cache(self, key: str, **kwargs) -> Any:
"""
Asynchronously retrieve a semantically similar cached response.
Args:
key: The cache key (not directly used in semantic caching)
**kwargs: Additional arguments including 'messages' for the prompt
Returns:
The cached response if a semantically similar prompt is found, else None
"""
print_verbose(f"Async Redis semantic-cache get_cache, kwargs: {kwargs}")
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic cache lookup")
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
prompt = get_str_from_messages(messages)
# Generate embedding for the prompt
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
# Check the cache for semantically similar prompts
results = await self.llmcache.acheck(
prompt=prompt,
vector=prompt_embedding
)
# handle results / cache hit
if not results:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 # TODO why here but not above??
return None
cache_hit = results[0]
vector_distance = float(cache_hit["vector_distance"])
# Convert vector distance back to similarity
# For cosine distance: 0 = most similar, 2 = least similar
# While similarity: 1 = most similar, 0 = least similar
similarity = 1 - vector_distance
cached_prompt = cache_hit["prompt"]
cached_response = cache_hit["response"]
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
print_verbose(
f"Cache hit: similarity threshold: {self.similarity_threshold}, "
f"actual similarity: {similarity}, "
f"current prompt: {prompt}, "
f"cached prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_response)
except Exception as e:
print_verbose(f"Error in async_get_cache: {str(e)}")
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
async def _index_info(self) -> Dict[str, Any]:
"""
Get information about the Redis index.
Returns:
Dict[str, Any]: Information about the Redis index
"""
aindex = await self.llmcache._get_async_index()
return await aindex.info()
async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]], **kwargs) -> None:
"""
Asynchronously store multiple values in the semantic cache.
Args:
cache_list: List of (key, value) tuples to cache
**kwargs: Additional arguments
"""
try:
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)
except Exception as e:
print_verbose(f"Error in async_set_cache_pipeline: {str(e)}")

View file

@ -9,6 +9,9 @@ from pydantic import BaseModel
import litellm
import litellm._logging
from litellm import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.llms.anthropic.cost_calculation import (
cost_per_token as anthropic_cost_per_token,
@ -57,6 +60,7 @@ from litellm.types.utils import (
LlmProvidersSet,
ModelInfo,
PassthroughCallTypes,
StandardBuiltInToolsParams,
Usage,
)
from litellm.utils import (
@ -524,6 +528,7 @@ def completion_cost( # noqa: PLR0915
optional_params: Optional[dict] = None,
custom_pricing: Optional[bool] = None,
base_model: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, or any litellm-supported LLM.
@ -802,6 +807,12 @@ def completion_cost( # noqa: PLR0915
rerank_billed_units=rerank_billed_units,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)
return _final_cost
except Exception as e:
@ -861,6 +872,7 @@ def response_cost_calculator(
base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None,
prompt: str = "",
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Returns
@ -890,6 +902,7 @@ def response_cost_calculator(
custom_pricing=custom_pricing,
base_model=base_model,
prompt=prompt,
standard_built_in_tools_params=standard_built_in_tools_params,
)
return response_cost
except Exception as e:
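A hedged sketch (model and options are illustrative) of how the new `standard_built_in_tools_params` argument threads through `completion_cost`:

import litellm
from litellm.types.llms.openai import WebSearchOptions
from litellm.types.utils import StandardBuiltInToolsParams

# mock_response avoids a real provider call for this sketch
response = litellm.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hi",
)

# Built-in tool usage (here: a medium-context web search) is added on top of
# the token-based prompt/completion cost.
cost = litellm.completion_cost(
    completion_response=response,
    standard_built_in_tools_params=StandardBuiltInToolsParams(
        web_search_options=WebSearchOptions(search_context_size="medium")
    ),
)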

View file

@ -10,13 +10,16 @@ import asyncio
import json
import os
import traceback
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from litellm.types.utils import StandardLoggingPayload
if TYPE_CHECKING:
from litellm.proxy._types import SpendLogsPayload
else:
SpendLogsPayload = Any
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import (
@ -61,7 +64,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.flush_lock = asyncio.Lock()
super().__init__(**kwargs, flush_lock=self.flush_lock)
asyncio.create_task(self.periodic_flush())
self.log_queue: List[SpendLogsPayload] = []
self.log_queue: List[Union[SpendLogsPayload, StandardLoggingPayload]] = []
async def construct_request_headers(self) -> Dict[str, str]:
"""Construct authorization headers using Vertex AI auth"""
@ -115,13 +118,20 @@ class GcsPubSubLogger(CustomBatchLogger):
verbose_logger.debug(
"PubSub: Logging - Enters logging function for model %s", kwargs
)
spend_logs_payload = get_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
self.log_queue.append(spend_logs_payload)
standard_logging_payload = kwargs.get("standard_logging_object", None)
# Backwards compatibility with old logging payload
if litellm.gcs_pub_sub_use_v1 is True:
spend_logs_payload = get_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
self.log_queue.append(spend_logs_payload)
else:
# New logging payload, StandardLoggingPayload
self.log_queue.append(standard_logging_payload)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
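A hedged sketch of the toggle referenced above: the Pub/Sub logger now enqueues StandardLoggingPayload objects by default, and the old SpendLogsPayload format stays available behind `litellm.gcs_pub_sub_use_v1`:

import litellm

# Opt back into the legacy v1 spend-log payloads for the GCS Pub/Sub logger;
# leaving this unset (the default) sends the new StandardLoggingPayload.
litellm.gcs_pub_sub_use_v1 = True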
@ -155,7 +165,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.log_queue.clear()
async def publish_message(
self, message: SpendLogsPayload
self, message: Union[SpendLogsPayload, StandardLoggingPayload]
) -> Optional[Dict[str, Any]]:
"""
Publish message to Google Cloud Pub/Sub using REST API

View file

@ -35,6 +35,9 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger,
@ -60,6 +63,7 @@ from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
RawRequestTypedDict,
StandardBuiltInToolsParams,
StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams,
@ -264,7 +268,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.standard_callback_dynamic_params: StandardCallbackDynamicParams = (
self.initialize_standard_callback_dynamic_params(kwargs)
)
self.standard_built_in_tools_params: StandardBuiltInToolsParams = (
self.initialize_standard_built_in_tools_params(kwargs)
)
## TIME TO FIRST TOKEN LOGGING ##
self.completion_start_time: Optional[datetime.datetime] = None
self._llm_caching_handler: Optional[LLMCachingHandler] = None
@ -369,6 +375,23 @@ class Logging(LiteLLMLoggingBaseClass):
"""
return _initialize_standard_callback_dynamic_params(kwargs)
def initialize_standard_built_in_tools_params(
self, kwargs: Optional[Dict] = None
) -> StandardBuiltInToolsParams:
"""
Initialize the standard built-in tools params from the kwargs
Checks for `web_search_options` in kwargs, or built-in tools in `tools`, and sets the corresponding attributes on StandardBuiltInToolsParams
"""
return StandardBuiltInToolsParams(
web_search_options=StandardBuiltInToolCostTracking._get_web_search_options(
kwargs or {}
),
file_search=StandardBuiltInToolCostTracking._get_file_search_tool_call(
kwargs or {}
),
)
def update_environment_variables(
self,
litellm_params: Dict,
@ -495,6 +518,16 @@ class Logging(LiteLLMLoggingBaseClass):
}
return data
def _get_masked_api_base(self, api_base: str) -> str:
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
return str(masked_api_base)
def _pre_call(self, input, api_key, model=None, additional_args={}):
"""
Common helper function across the sync + async pre-call function
@ -508,6 +541,9 @@ class Logging(LiteLLMLoggingBaseClass):
model
): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
@ -691,15 +727,6 @@ class Logging(LiteLLMLoggingBaseClass):
headers = {}
data = additional_args.get("complete_input_dict", {})
api_base = str(additional_args.get("api_base", ""))
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
self.model_call_details["litellm_params"]["api_base"] = masked_api_base
curl_command = self._get_request_curl_command(
api_base=api_base,
headers=headers,
@ -714,11 +741,12 @@ class Logging(LiteLLMLoggingBaseClass):
def _get_request_curl_command(
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
) -> str:
masked_api_base = self._get_masked_api_base(api_base)
if headers is None:
headers = {}
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n"
curl_command += f"{api_base} \\\n"
curl_command += f"{masked_api_base} \\\n"
masked_headers = self._get_masked_headers(headers)
formatted_headers = " ".join(
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
@ -903,6 +931,7 @@ class Logging(LiteLLMLoggingBaseClass):
"optional_params": self.optional_params,
"custom_pricing": custom_pricing,
"prompt": prompt,
"standard_built_in_tools_params": self.standard_built_in_tools_params,
}
except Exception as e: # error creating kwargs for cost calculation
debug_info = StandardLoggingModelCostFailureDebugInformation(
@ -1067,6 +1096,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif isinstance(result, dict): # pass-through endpoints
@ -1079,6 +1109,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif standard_logging_object is not None:
@ -1102,6 +1133,7 @@ class Logging(LiteLLMLoggingBaseClass):
prompt="",
completion=getattr(result, "content", ""),
total_time=float_diff,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
return start_time, end_time, result
@ -1155,6 +1187,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
@ -1695,6 +1728,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
@ -1911,6 +1945,7 @@ class Logging(LiteLLMLoggingBaseClass):
status="failure",
error_str=str(exception),
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
return start_time, end_time
@ -3367,6 +3402,7 @@ def get_standard_logging_object_payload(
status: StandardLoggingPayloadStatus,
error_str: Optional[str] = None,
original_exception: Optional[Exception] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> Optional[StandardLoggingPayload]:
try:
kwargs = kwargs or {}
@ -3542,6 +3578,7 @@ def get_standard_logging_object_payload(
guardrail_information=metadata.get(
"standard_logging_guardrail_information", None
),
standard_built_in_tools_params=standard_built_in_tools_params,
)
emit_standard_logging_payload(payload)

View file

@ -0,0 +1,199 @@
"""
Helper utilities for tracking the cost of built-in tools.
"""
from typing import Any, Dict, List, Optional
import litellm
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
ModelInfo,
ModelResponse,
SearchContextCostPerQuery,
StandardBuiltInToolsParams,
)
class StandardBuiltInToolCostTracking:
"""
Helper class for tracking the cost of built-in tools
Example: Web Search
"""
@staticmethod
def get_cost_for_built_in_tools(
model: str,
response_object: Any,
custom_llm_provider: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Get the cost of using built-in tools.
Supported tools:
- Web Search
- File Search
"""
if standard_built_in_tools_params is not None:
if (
standard_built_in_tools_params.get("web_search_options", None)
is not None
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=standard_built_in_tools_params.get(
"web_search_options", None
),
model_info=model_info,
)
if standard_built_in_tools_params.get("file_search", None) is not None:
return StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=standard_built_in_tools_params.get("file_search", None),
)
if isinstance(response_object, ModelResponse):
if StandardBuiltInToolCostTracking.chat_completion_response_includes_annotations(
response_object
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
return 0.0
@staticmethod
def _safe_get_model_info(
model: str, custom_llm_provider: Optional[str] = None
) -> Optional[ModelInfo]:
try:
return litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
return None
@staticmethod
def get_cost_for_web_search(
web_search_options: Optional[WebSearchOptions] = None,
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If the request includes `web_search_options`, calculate the cost of the web search.
"""
if web_search_options is None:
return 0.0
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
)
if web_search_options.get("search_context_size", None) == "low":
return search_context_pricing.get("search_context_size_low", 0.0)
elif web_search_options.get("search_context_size", None) == "medium":
return search_context_pricing.get("search_context_size_medium", 0.0)
elif web_search_options.get("search_context_size", None) == "high":
return search_context_pricing.get("search_context_size_high", 0.0)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
@staticmethod
def get_default_cost_for_web_search(
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If no web search options are provided, use the `search_context_size_medium` pricing.
https://platform.openai.com/docs/pricing#web-search
"""
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
) or {}
return search_context_pricing.get("search_context_size_medium", 0.0)
@staticmethod
def get_cost_for_file_search(
file_search: Optional[FileSearchTool] = None,
) -> float:
""" "
Charged at $2.50/1k calls
Doc: https://platform.openai.com/docs/pricing#built-in-tools
"""
if file_search is None:
return 0.0
return 2.5 / 1000
@staticmethod
def chat_completion_response_includes_annotations(
response_object: ModelResponse,
) -> bool:
for _choice in response_object.choices:
message = getattr(_choice, "message", None)
if (
message is not None
and hasattr(message, "annotations")
and message.annotations is not None
and len(message.annotations) > 0
):
return True
return False
@staticmethod
def _get_web_search_options(kwargs: Dict) -> Optional[WebSearchOptions]:
if "web_search_options" in kwargs:
return WebSearchOptions(**kwargs.get("web_search_options", {}))
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "web_search_preview"
)
if tools:
# Look for web search tool in the tools array
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_web_search_tool_call(tool):
return WebSearchOptions(**tool)
return None
@staticmethod
def _get_tools_from_kwargs(kwargs: Dict, tool_type: str) -> Optional[List[Dict]]:
if "tools" in kwargs:
tools = kwargs.get("tools", [])
return tools
return None
@staticmethod
def _get_file_search_tool_call(kwargs: Dict) -> Optional[FileSearchTool]:
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "file_search"
)
if tools:
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_file_search_tool_call(tool):
return FileSearchTool(**tool)
return None
@staticmethod
def _is_web_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "web_search_preview":
return True
if "search_context_size" in tool:
return True
return False
@staticmethod
def _is_file_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "file_search":
return True
return False
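A hedged usage sketch for the helpers above, using the flat file-search price from the docstring ($2.50 per 1k calls):

from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
    StandardBuiltInToolCostTracking,
)
from litellm.types.llms.openai import FileSearchTool

# File search is flat-priced: $2.50 / 1000 calls = 0.0025 USD per call.
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
    file_search=FileSearchTool(type="file_search")
)
assert cost == 2.5 / 1000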

View file

@ -138,13 +138,22 @@ class ModelParamHelper:
TranscriptionCreateParamsNonStreaming,
TranscriptionCreateParamsStreaming,
)
non_streaming_kwargs = set(getattr(TranscriptionCreateParamsNonStreaming, "__annotations__", {}).keys())
streaming_kwargs = set(getattr(TranscriptionCreateParamsStreaming, "__annotations__", {}).keys())
non_streaming_kwargs = set(
getattr(
TranscriptionCreateParamsNonStreaming, "__annotations__", {}
).keys()
)
streaming_kwargs = set(
getattr(
TranscriptionCreateParamsStreaming, "__annotations__", {}
).keys()
)
all_transcription_kwargs = non_streaming_kwargs.union(streaming_kwargs)
return all_transcription_kwargs
except Exception as e:
verbose_logger.warning("Error getting transcription kwargs %s", str(e))
verbose_logger.debug("Error getting transcription kwargs %s", str(e))
return set()
@staticmethod

View file

@ -5304,6 +5304,17 @@
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": {
"max_tokens": 3072,
"max_input_tokens": 3072,

View file

@ -5,7 +5,10 @@ model_list:
api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090
rpm: 3
- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
num_retries: 0

View file

@ -542,13 +542,10 @@ async def vertex_proxy_route(
user_api_key_dict,
stream=is_streaming_request, # type: ignore
)
except Exception as e:
except ProxyException as e:
if headers_passed_through:
raise Exception(
f"No credentials found on proxy for this request. Headers were passed through directly but request failed with error: {str(e)}"
)
else:
raise e
e.message = f"No credentials found on proxy for project_name={vertex_project} + location={vertex_location}, check `/model/info` for allowed project + region combinations with `use_in_pass_through: true`. Headers were passed through directly but request failed with error: {e.message}"
raise e
return received_value

View file

@ -1788,9 +1788,6 @@ class ProxyConfig:
reset_color_code,
cache_password,
)
if cache_type == "redis-semantic":
# by default this should always be async
cache_params.update({"redis_semantic_cache_use_async": True})
# users can pass os.environ/ variables on the proxy - we should read them from the env
for key, value in cache_params.items():
@ -6181,18 +6178,18 @@ async def model_info_v1( # noqa: PLR0915
)
if len(all_models_str) > 0:
model_names = all_models_str
llm_model_list = llm_router.get_model_list()
_relevant_models = []
for model in all_models_str:
router_models = llm_router.get_model_list(model_name=model)
if router_models is not None:
_relevant_models.extend(router_models)
if llm_model_list is not None:
_relevant_models = [
m for m in llm_model_list if m["model_name"] in model_names
]
all_models = copy.deepcopy(_relevant_models) # type: ignore
else:
all_models = []
for model in all_models:
model = _get_proxy_model_info(model=model)
for in_place_model in all_models:
in_place_model = _get_proxy_model_info(model=in_place_model)
verbose_proxy_logger.debug("all_models: %s", all_models)
return {"data": all_models}

View file

@ -4924,6 +4924,11 @@ class Router:
and model_info["supports_function_calling"] is True # type: ignore
):
model_group_info.supports_function_calling = True
if (
model_info.get("supports_web_search", None) is not None
and model_info["supports_web_search"] is True # type: ignore
):
model_group_info.supports_web_search = True
if (
model_info.get("supported_openai_params", None) is not None
and model_info["supported_openai_params"] is not None
@ -5286,10 +5291,11 @@ class Router:
if len(returned_models) == 0: # check if wildcard route
potential_wildcard_models = self.pattern_router.route(model_name)
if potential_wildcard_models is not None:
returned_models.extend(
[DeploymentTypedDict(**m) for m in potential_wildcard_models] # type: ignore
)
if model_name is not None and potential_wildcard_models is not None:
for m in potential_wildcard_models:
deployment_typed_dict = DeploymentTypedDict(**m) # type: ignore
deployment_typed_dict["model_name"] = model_name
returned_models.append(deployment_typed_dict)
if model_name is None:
returned_models += self.model_list

View file

@ -382,6 +382,53 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class WebSearchOptionsUserLocationApproximate(TypedDict, total=False):
city: str
"""Free text input for the city of the user, e.g. `San Francisco`."""
country: str
"""
The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
the user, e.g. `US`.
"""
region: str
"""Free text input for the region of the user, e.g. `California`."""
timezone: str
"""
The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
user, e.g. `America/Los_Angeles`.
"""
class WebSearchOptionsUserLocation(TypedDict, total=False):
approximate: Required[WebSearchOptionsUserLocationApproximate]
"""Approximate location parameters for the search."""
type: Required[Literal["approximate"]]
"""The type of location approximation. Always `approximate`."""
class WebSearchOptions(TypedDict, total=False):
search_context_size: Literal["low", "medium", "high"]
"""
High level guidance for the amount of context window space to use for the
search. One of `low`, `medium`, or `high`. `medium` is the default.
"""
user_location: Optional[WebSearchOptionsUserLocation]
"""Approximate location parameters for the search."""
class FileSearchTool(TypedDict, total=False):
type: Literal["file_search"]
"""The type of tool being defined: `file_search`"""
vector_store_ids: Optional[List[str]]
"""The IDs of the vector stores to search."""
class ChatCompletionAnnotationURLCitation(TypedDict, total=False):
end_index: int
"""The index of the last character of the URL citation in the message."""

View file

@ -559,6 +559,7 @@ class ModelGroupInfo(BaseModel):
rpm: Optional[int] = None
supports_parallel_function_calling: bool = Field(default=False)
supports_vision: bool = Field(default=False)
supports_web_search: bool = Field(default=False)
supports_function_calling: bool = Field(default=False)
supported_openai_params: Optional[List[str]] = Field(default=[])
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None

View file

@ -32,7 +32,9 @@ from .llms.openai import (
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionUsageBlock,
FileSearchTool,
OpenAIChatCompletionChunk,
WebSearchOptions,
)
from .rerank import RerankResponse
@ -97,6 +99,13 @@ class ProviderSpecificModelInfo(TypedDict, total=False):
supports_pdf_input: Optional[bool]
supports_native_streaming: Optional[bool]
supports_parallel_function_calling: Optional[bool]
supports_web_search: Optional[bool]
class SearchContextCostPerQuery(TypedDict, total=False):
search_context_size_low: float
search_context_size_medium: float
search_context_size_high: float
class ModelInfoBase(ProviderSpecificModelInfo, total=False):
@ -135,6 +144,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
output_cost_per_video_per_second: Optional[float] # only for vertex ai models
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
output_cost_per_second: Optional[float] # for OpenAI Speech models
search_context_cost_per_query: Optional[
SearchContextCostPerQuery
] # Cost for using web search tool
litellm_provider: Required[str]
mode: Required[
@ -586,6 +598,11 @@ class Message(OpenAIObject):
# OpenAI compatible APIs like mistral API will raise an error if audio is passed in
del self.audio
if annotations is None:
# ensure default response matches OpenAI spec
# Some OpenAI compatible APIs raise an error if annotations are passed in
del self.annotations
if reasoning_content is None:
# ensure default response matches OpenAI spec
del self.reasoning_content
@ -1612,6 +1629,19 @@ class StandardLoggingUserAPIKeyMetadata(TypedDict):
user_api_key_end_user_id: Optional[str]
class StandardBuiltInToolsParams(TypedDict, total=False):
"""
Standard built-in OpenAI tools parameters.
Used to calculate the cost of built-in tools; add any standard built-in tool parameters here.
OpenAI charges users based on the `web_search_options` parameter
"""
web_search_options: Optional[WebSearchOptions]
file_search: Optional[FileSearchTool]
class StandardLoggingPromptManagementMetadata(TypedDict):
prompt_id: str
prompt_variables: Optional[dict]
@ -1729,6 +1759,7 @@ class StandardLoggingPayload(TypedDict):
model_parameters: dict
hidden_params: StandardLoggingHiddenParams
guardrail_information: Optional[StandardLoggingGuardrailInformation]
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams]
from typing import AsyncIterator, Iterator

View file

@ -1975,7 +1975,7 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
)
def supports_web_search(model: str, custom_llm_provider: Optional[str]) -> bool:
def supports_web_search(model: str, custom_llm_provider: Optional[str] = None) -> bool:
"""
Check if the given model supports web search and return a boolean value.
@ -4544,6 +4544,10 @@ def _get_model_info_helper( # noqa: PLR0915
supports_native_streaming=_model_info.get(
"supports_native_streaming", None
),
supports_web_search=_model_info.get("supports_web_search", False),
search_context_cost_per_query=_model_info.get(
"search_context_cost_per_query", None
),
tpm=_model_info.get("tpm", None),
rpm=_model_info.get("rpm", None),
)
@ -4612,6 +4616,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
supports_audio_input: Optional[bool]
supports_audio_output: Optional[bool]
supports_pdf_input: Optional[bool]
supports_web_search: Optional[bool]
Raises:
Exception: If the model is not mapped yet.
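A hedged usage sketch for the fields surfaced above (model name illustrative, taken from the tests later in this diff):

import litellm

# supports_web_search no longer requires custom_llm_provider; pricing for the
# web search tool is exposed via search_context_cost_per_query in model info.
if litellm.utils.supports_web_search("gpt-4o-search-preview"):
    info = litellm.get_model_info("gpt-4o-search-preview")
    pricing = info.get("search_context_cost_per_query") or {}
    print(pricing.get("search_context_size_medium"))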

View file

@ -5304,6 +5304,17 @@
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": {
"max_tokens": 3072,
"max_input_tokens": 3072,

poetry.lock (generated)
View file

@ -810,15 +810,15 @@ test = ["pytest (>=6)"]
[[package]]
name = "fastapi"
version = "0.115.11"
version = "0.115.12"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"proxy\""
files = [
{file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"},
{file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"},
{file = "fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d"},
{file = "fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681"},
]
[package.dependencies]
@ -1445,14 +1445,14 @@ type = ["pytest-mypy"]
[[package]]
name = "iniconfig"
version = "2.0.0"
version = "2.1.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
{file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
{file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
]
[[package]]
@ -2137,14 +2137,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]]
name = "openai"
version = "1.66.3"
version = "1.68.2"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9"},
{file = "openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9"},
{file = "openai-1.68.2-py3-none-any.whl", hash = "sha256:24484cb5c9a33b58576fdc5acf0e5f92603024a4e39d0b99793dfa1eb14c2b36"},
{file = "openai-1.68.2.tar.gz", hash = "sha256:b720f0a95a1dbe1429c0d9bb62096a0d98057bcda82516f6e8af10284bdd5b19"},
]
[package.dependencies]
@ -2160,6 +2160,7 @@ typing-extensions = ">=4.11,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<15)"]
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
[[package]]
name = "orjson"
@ -2477,24 +2478,24 @@ testing = ["google-api-core (>=1.31.5)"]
[[package]]
name = "protobuf"
version = "5.29.3"
version = "5.29.4"
description = ""
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"extra-proxy\""
files = [
{file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"},
{file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"},
{file = "protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f"},
{file = "protobuf-5.29.3-cp38-cp38-win32.whl", hash = "sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252"},
{file = "protobuf-5.29.3-cp38-cp38-win_amd64.whl", hash = "sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107"},
{file = "protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7"},
{file = "protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = "sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da"},
{file = "protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f"},
{file = "protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620"},
{file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
{file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
{file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"},
{file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"},
{file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"},
{file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"},
{file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"},
{file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"},
{file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"},
{file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"},
{file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
]
[[package]]
@ -2809,6 +2810,25 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-asyncio"
version = "0.21.2"
description = "Pytest support for asyncio"
optional = false
python-versions = ">=3.7"
groups = ["dev"]
files = [
{file = "pytest_asyncio-0.21.2-py3-none-any.whl", hash = "sha256:ab664c88bb7998f711d8039cacd4884da6430886ae8bbd4eded552ed2004f16b"},
{file = "pytest_asyncio-0.21.2.tar.gz", hash = "sha256:d67738fc232b94b326b9d060750beb16e0074210b98dd8b58a5239fa2a154f45"},
]
[package.dependencies]
pytest = ">=7.0.0"
[package.extras]
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
[[package]]
name = "pytest-mock"
version = "3.14.0"
@ -3279,15 +3299,15 @@ files = [
[[package]]
name = "rq"
version = "2.1.0"
version = "2.2.0"
description = "RQ is a simple, lightweight, library for creating background jobs, and processing them."
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"proxy\""
files = [
{file = "rq-2.1.0-py3-none-any.whl", hash = "sha256:3c6892c6ca848e5fb47c1875399a66f13656bf0e123bf725d9aa9a12718e2fdf"},
{file = "rq-2.1.0.tar.gz", hash = "sha256:764585b6cab69ef1412f4aee523347e5aa7ece3ca175c118b1d92223dd8c2826"},
{file = "rq-2.2.0-py3-none-any.whl", hash = "sha256:dacbfe1ccb79a45c8cd95dec7951620679fa0195570b63da3f9347622d33accc"},
{file = "rq-2.2.0.tar.gz", hash = "sha256:b636760f1e4c183022031c142faa0483e687885824e9732ba2953f994104e203"},
]
[package.dependencies]
@ -3606,15 +3626,15 @@ files = [
[[package]]
name = "tzdata"
version = "2025.1"
version = "2025.2"
description = "Provider of IANA time zone data"
optional = true
python-versions = ">=2"
groups = ["main"]
markers = "extra == \"proxy\" and platform_system == \"Windows\""
files = [
{file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"},
{file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"},
{file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"},
{file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"},
]
[[package]]
@ -3985,4 +4005,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "boto3", "cryptography", "fastapi",
[metadata]
lock-version = "2.1"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "f7c21b3d659e4a15cd46bb42fb905ad039028f4f6b82507fd1278ac05c412569"
content-hash = "9c863b11189227a035a9130c8872de44fe7c5e1e32b47569a56af86e3f6570c5"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.63.14"
version = "1.64.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -98,13 +98,14 @@ black = "^23.12.0"
mypy = "^1.0"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"
pytest-asyncio = "^0.21.1"
[build-system]
requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.63.14"
version = "1.64.0"
version_files = [
"pyproject.toml:^version"
]

View file

@ -9,8 +9,8 @@ uvicorn==0.29.0 # server dep
gunicorn==23.0.0 # server dep
uvloop==0.21.0 # uvicorn dep, gives us much better performance under load
boto3==1.34.34 # aws bedrock/sagemaker calls
redis==5.0.0 # caching
numpy==2.1.1 # semantic caching
redis==5.2.1 # redis caching
redisvl==0.4.1 # semantic caching
prisma==0.11.0 # for db
mangum==0.17.0 # for aws lambda functions
pynacl==1.5.0 # for encrypting keys

View file

@ -1,13 +1,8 @@
import asyncio
import json
import os
import sys
import time
from unittest.mock import MagicMock, patch
import httpx
import pytest
import respx
from fastapi.testclient import TestClient
sys.path.insert(
@ -18,9 +13,18 @@ from unittest.mock import AsyncMock
from litellm.caching.redis_cache import RedisCache
@pytest.fixture
def redis_no_ping():
"""Patch RedisCache initialization to prevent async ping tasks from being created"""
with patch('asyncio.get_running_loop') as mock_get_loop:
# Either raise an exception or return a mock that will handle the task creation
mock_get_loop.side_effect = RuntimeError("No running event loop")
yield
@pytest.mark.parametrize("namespace", [None, "test"])
@pytest.mark.asyncio
async def test_redis_cache_async_increment(namespace, monkeypatch):
async def test_redis_cache_async_increment(namespace, monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache(namespace=namespace)
# Create an AsyncMock for the Redis client
@ -47,10 +51,46 @@ async def test_redis_cache_async_increment(namespace, monkeypatch):
@pytest.mark.asyncio
async def test_redis_client_init_with_socket_timeout(monkeypatch):
async def test_redis_client_init_with_socket_timeout(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "my-fake-host")
redis_cache = RedisCache(socket_timeout=1.0)
assert redis_cache.redis_kwargs["socket_timeout"] == 1.0
client = redis_cache.init_async_client()
assert client is not None
assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0
@pytest.mark.asyncio
async def test_redis_cache_async_batch_get_cache(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache()
# Create an AsyncMock for the Redis client
mock_redis_instance = AsyncMock()
# Make sure the mock can be used as an async context manager
mock_redis_instance.__aenter__.return_value = mock_redis_instance
mock_redis_instance.__aexit__.return_value = None
# Setup the return value for mget
mock_redis_instance.mget.return_value = [
b'{"key1": "value1"}',
None,
b'{"key3": "value3"}'
]
test_keys = ["key1", "key2", "key3"]
with patch.object(
redis_cache, "init_async_client", return_value=mock_redis_instance
):
# Call async_batch_get_cache
result = await redis_cache.async_batch_get_cache(key_list=test_keys)
# Verify mget was called with the correct keys
mock_redis_instance.mget.assert_called_once()
# Check that results were properly decoded
assert result["key1"] == {"key1": "value1"}
assert result["key2"] is None
assert result["key3"] == {"key3": "value3"}

View file

@ -0,0 +1,130 @@
import os
import sys
from unittest.mock import MagicMock, patch, AsyncMock
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Tests for RedisSemanticCache
def test_redis_semantic_cache_initialization(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=MagicMock())
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize the cache with a similarity threshold
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Verify the semantic cache was initialized with correct parameters
assert redis_semantic_cache.similarity_threshold == 0.8
# Use pytest.approx for floating point comparison to handle precision issues
assert redis_semantic_cache.distance_threshold == pytest.approx(0.2, abs=1e-10)
assert redis_semantic_cache.embedding_model == "text-embedding-ada-002"
# Test initialization with missing similarity_threshold
with pytest.raises(ValueError, match="similarity_threshold must be provided"):
RedisSemanticCache()
def test_redis_semantic_cache_get_cache(monkeypatch):
# Mock the redisvl import and embedding function
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the llmcache.check method to return a result
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.check = MagicMock(return_value=mock_result)
# Mock the embedding function
with patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}):
# Test get_cache with a message
result = redis_semantic_cache.get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}]
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify llmcache.check was called
redis_semantic_cache.llmcache.check.assert_called_once()
@pytest.mark.asyncio
async def test_redis_semantic_cache_async_get_cache(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the async methods
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.acheck = AsyncMock(return_value=mock_result)
redis_semantic_cache._get_async_embedding = AsyncMock(return_value=[0.1, 0.2, 0.3])
# Test async_get_cache with a message
result = await redis_semantic_cache.async_get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}],
metadata={}
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify methods were called
redis_semantic_cache._get_async_embedding.assert_called_once()
redis_semantic_cache.llmcache.acheck.assert_called_once()

View file

@ -0,0 +1,113 @@
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
import litellm
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import ModelInfo, ModelResponse, StandardBuiltInToolsParams
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Test basic web search cost calculations
def test_web_search_cost_low():
web_search_options = WebSearchOptions(search_context_size="low")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_low"]
)
def test_web_search_cost_medium():
web_search_options = WebSearchOptions(search_context_size="medium")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost
== model_info["search_context_cost_per_query"]["search_context_size_medium"]
)
def test_web_search_cost_high():
web_search_options = WebSearchOptions(search_context_size="high")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_high"]
)
# Test file search cost calculation
def test_file_search_cost():
file_search = FileSearchTool(type="file_search")
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=file_search
)
assert cost == 0.0025 # $2.50/1000 calls = 0.0025 per call
# Test edge cases
def test_none_inputs():
# Test with None inputs
assert (
StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=None, model_info=None
)
== 0.0
)
assert (
StandardBuiltInToolCostTracking.get_cost_for_file_search(file_search=None)
== 0.0
)
# Test the main get_cost_for_built_in_tools method
def test_get_cost_for_built_in_tools_web_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
web_search_options=WebSearchOptions(search_context_size="medium")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert isinstance(cost, float)
def test_get_cost_for_built_in_tools_file_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
file_search=FileSearchTool(type="file_search")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert cost == 0.0025

View file

@ -0,0 +1,34 @@
import json
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
import time
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
@pytest.fixture
def logging_obj():
return LitellmLogging(
model="bedrock/claude-3-5-sonnet-20240620-v1:0",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
)
def test_get_masked_api_base(logging_obj):
api_base = "https://api.openai.com/v1"
masked_api_base = logging_obj._get_masked_api_base(api_base)
assert masked_api_base == "https://api.openai.com/v1"
assert type(masked_api_base) == str
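A hedged companion check (not in the diff) for the "key=" masking branch of `_get_masked_api_base`, reusing the fixture above; the URL and key value are illustrative:

def test_get_masked_api_base_with_key(logging_obj):
    # Everything after "key=" should be replaced with "*****" plus the last
    # four characters of the api_base.
    masked = logging_obj._get_masked_api_base("https://example.com/v1?key=sk-secret-1234")
    assert masked == "https://example.com/v1?key=*****1234"
    assert "sk-secret" not in masked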

View file

@ -1,3 +1,4 @@
import asyncio
import datetime
import json
import os
@ -11,7 +12,13 @@ sys.path.insert(
0, os.path.abspath("../../../..")
) # Adds the parent directory to the system path
from unittest.mock import MagicMock, patch
import litellm
from litellm.proxy._types import SpendLogsPayload
from litellm.proxy.hooks.proxy_track_cost_callback import _ProxyDBLogger
from litellm.proxy.proxy_server import app, prisma_client
from litellm.router import Router
@pytest.fixture
@ -400,3 +407,270 @@ async def test_ui_view_spend_logs_unauthorized(client):
headers={"Authorization": "Bearer invalid-token"},
)
assert response.status_code == 401 or response.status_code == 403
class TestSpendLogsPayload:
@pytest.mark.asyncio
async def test_spend_logs_payload_e2e(self):
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(litellm.proxy.proxy_server, "prisma_client"):
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="Hello, world!",
metadata={"user_api_key_end_user_id": "test_user_1"},
)
assert response.choices[0].message.content == "Hello, world!"
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "gpt-4o",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}',
"cache_key": "Cache OFF",
"spend": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "openai",
"messages": "{}",
"response": "{}",
}
)
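# dynamic fields (request id and timestamps) are only checked for presence; all other fields must match exactly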
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
def mock_anthropic_response(*args, **kwargs):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.headers = {"Content-Type": "application/json"}
mock_response.json.return_value = {
"content": [{"text": "Hi! My name is Claude.", "type": "text"}],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": None,
"type": "message",
"usage": {"input_tokens": 2095, "output_tokens": 503},
}
return mock_response
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_api_base(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await litellm.acompletion(
model="claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_router(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
router = Router(
model_list=[
{
"model_name": "my-anthropic-model-group",
"litellm_params": {
"model": "claude-3-7-sonnet-20250219",
},
"model_info": {
"id": "my-unique-model-id",
},
}
]
)
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await router.acompletion(
model="my-anthropic-model-group",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "my-anthropic-model-group",
"model_id": "my-unique-model-id",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"

View file

@@ -477,6 +477,25 @@ def test_supports_function_calling(model, expected_bool):
pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
"model, expected_bool",
[
("gpt-4o-mini-search-preview", True),
("openai/gpt-4o-mini-search-preview", True),
("gpt-4o-search-preview", True),
("openai/gpt-4o-search-preview", True),
("groq/deepseek-r1-distill-llama-70b", False),
("groq/llama-3.3-70b-versatile", False),
("codestral/codestral-latest", False),
],
)
def test_supports_web_search(model, expected_bool):
try:
assert litellm.supports_web_search(model=model) == expected_bool
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_get_max_token_unit_test():
"""
More complete testing in `test_completion_cost.py`

View file

@@ -794,7 +794,7 @@ def test_redis_cache_completion():
response3 = completion(
model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
)
response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
response4 = completion(model="gpt-4o-mini", messages=messages, caching=True)
print("\nresponse 1", response1)
print("\nresponse 2", response2)
@@ -1690,20 +1690,12 @@ def test_cache_context_managers():
print("VARS of litellm.cache", vars(litellm.cache))
# test_cache_context_managers()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
def test_redis_semantic_cache_completion():
litellm.set_verbose = True
import logging
logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding /reading from cache
print("testing semantic caching")
litellm.cache = Cache(
type="redis-semantic",
@@ -1718,33 +1710,30 @@ def test_redis_semantic_cache_completion():
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summer",
}
],
max_tokens=20,
)
print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summertime",
}
],
max_tokens=20,
)
print(f"response2: {response1}")
print(f"response2: {response2}")
assert response1.id == response2.id
# test_redis_cache_completion()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
@pytest.mark.asyncio
async def test_redis_semantic_cache_acompletion():
litellm.set_verbose = True
@@ -1752,38 +1741,32 @@ async def test_redis_semantic_cache_acompletion():
logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
print("testing semantic caching")
litellm.cache = Cache(
type="redis-semantic",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8,
redis_semantic_cache_use_async=True,
similarity_threshold=0.7,
)
response1 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summer",
}
],
max_tokens=5,
)
print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summertime",
}
],
max_tokens=5,

View file

@@ -0,0 +1,175 @@
{
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"trace_id": null,
"call_type": "acompletion",
"cache_hit": null,
"stream": true,
"status": "success",
"custom_llm_provider": "openai",
"saved_cache_cost": 0.0,
"startTime": "2025-01-24 09:20:46.847371",
"endTime": "2025-01-24 09:20:46.851954",
"completionStartTime": "2025-01-24 09:20:46.851954",
"response_time": 0.007394075393676758,
"model": "gpt-4o",
"metadata": {
"user_api_key_hash": null,
"user_api_key_alias": null,
"user_api_key_team_id": null,
"user_api_key_org_id": null,
"user_api_key_user_id": null,
"user_api_key_team_alias": null,
"user_api_key_user_email": null,
"spend_logs_metadata": null,
"requester_ip_address": null,
"requester_metadata": null,
"user_api_key_end_user_id": null,
"prompt_management_metadata": null,
"applied_guardrails": []
},
"cache_key": null,
"response_cost": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": [],
"end_user": "",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": null,
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"response": {
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"created": 1742855151,
"model": "gpt-4o",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "hi",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 20,
"prompt_tokens": 10,
"total_tokens": 30,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
},
"model_parameters": {},
"hidden_params": {
"model_id": null,
"cache_key": null,
"api_base": "https://api.openai.com",
"response_cost": 0.00022500000000000002,
"additional_headers": {},
"litellm_overhead_time_ms": null,
"batch_models": null,
"litellm_model_name": "gpt-4o"
},
"model_map_information": {
"model_map_key": "gpt-4o",
"model_map_value": {
"key": "gpt-4o",
"max_tokens": 16384,
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"input_cost_per_token": 2.5e-06,
"cache_creation_input_token_cost": null,
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"input_cost_per_query": null,
"input_cost_per_second": null,
"input_cost_per_audio_token": null,
"input_cost_per_token_batches": 1.25e-06,
"output_cost_per_token_batches": 5e-06,
"output_cost_per_token": 1e-05,
"output_cost_per_audio_token": null,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_cost_per_second": null,
"output_cost_per_image": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat",
"supports_system_messages": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"supports_assistant_prefill": false,
"supports_prompt_caching": true,
"supports_audio_input": false,
"supports_audio_output": false,
"supports_pdf_input": false,
"supports_embedding_image_input": false,
"supports_native_streaming": null,
"supports_web_search": true,
"search_context_cost_per_query": {
"search_context_size_low": 0.03,
"search_context_size_medium": 0.035,
"search_context_size_high": 0.05
},
"tpm": null,
"rpm": null,
"supported_openai_params": [
"frequency_penalty",
"logit_bias",
"logprobs",
"top_logprobs",
"max_tokens",
"max_completion_tokens",
"modalities",
"prediction",
"n",
"presence_penalty",
"seed",
"stop",
"stream",
"stream_options",
"temperature",
"top_p",
"tools",
"tool_choice",
"function_call",
"functions",
"max_retries",
"extra_headers",
"parallel_tool_calls",
"audio",
"response_format",
"user"
]
}
},
"error_str": null,
"error_information": {
"error_code": "",
"error_class": "",
"llm_provider": "",
"traceback": "",
"error_message": ""
},
"response_cost_failure_debug_info": null,
"guardrail_information": null,
"standard_built_in_tools_params": {
"web_search_options": null,
"file_search": null
}
}

View file

@@ -0,0 +1,151 @@
import os
import sys
import traceback
import uuid
import pytest
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
load_dotenv()
import io
import os
import time
import json
# this file tests litellm/proxy
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
import asyncio
from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger):
def __init__(self):
self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print(
"standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str),
)
self.recorded_usage = Usage(
prompt_tokens=standard_logging_payload.get("prompt_tokens"),
completion_tokens=standard_logging_payload.get("completion_tokens"),
total_tokens=standard_logging_payload.get("total_tokens"),
)
async def _setup_web_search_test():
"""Helper function to setup common test requirements"""
litellm._turn_on_debug()
test_custom_logger = TestCustomLogger()
litellm.callbacks = [test_custom_logger]
return test_custom_logger
async def _verify_web_search_cost(test_custom_logger, expected_context_size):
"""Helper function to verify web search costs"""
await asyncio.sleep(1)
standard_logging_payload = test_custom_logger.standard_logging_payload
response_cost = standard_logging_payload.get("response_cost")
assert response_cost is not None
# Calculate token cost
model_map_information = standard_logging_payload["model_map_information"]
model_map_value: ModelInfoBase = model_map_information["model_map_value"]
total_token_cost = (
standard_logging_payload["prompt_tokens"]
* model_map_value["input_cost_per_token"]
) + (
standard_logging_payload["completion_tokens"]
* model_map_value["output_cost_per_token"]
)
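# expected cost = per-token cost plus the flat per-query web search charge for the given context size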
# Verify total cost
assert (
response_cost
== total_token_cost
+ model_map_value["search_context_cost_per_query"][expected_context_size]
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"web_search_options,expected_context_size",
[
(None, "search_context_size_medium"),
({"search_context_size": "low"}, "search_context_size_low"),
({"search_context_size": "high"}, "search_context_size_high"),
],
)
async def test_openai_web_search_logging_cost_tracking(
web_search_options, expected_context_size
):
"""Test web search cost tracking with different search context sizes"""
test_custom_logger = await _setup_web_search_test()
request_kwargs = {
"model": "openai/gpt-4o-search-preview",
"messages": [
{"role": "user", "content": "What was a positive news story from today?"}
],
}
if web_search_options is not None:
request_kwargs["web_search_options"] = web_search_options
response = await litellm.acompletion(**request_kwargs)
await _verify_web_search_cost(test_custom_logger, expected_context_size)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"tools_config,expected_context_size,stream",
[
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
True,
),
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
False,
),
([{"type": "web_search_preview"}], "search_context_size_medium", True),
([{"type": "web_search_preview"}], "search_context_size_medium", False),
],
)
async def test_openai_responses_api_web_search_cost_tracking(
tools_config, expected_context_size, stream
):
"""Test web search cost tracking with different search context sizes and streaming options"""
test_custom_logger = await _setup_web_search_test()
response = await litellm.aresponses(
model="openai/gpt-4o",
input=[
{"role": "user", "content": "What was a positive news story from today?"}
],
tools=tools_config,
stream=stream,
)
if stream is True:
async for chunk in response:
print("chunk", chunk)
else:
print("response", response)
await _verify_web_search_cost(test_custom_logger, expected_context_size)

View file

@@ -6,6 +6,7 @@ import sys
sys.path.insert(0, os.path.abspath("../.."))
import asyncio
import litellm
import gzip
import json
import logging
@@ -48,8 +49,15 @@ def assert_gcs_pubsub_request_matches_expected(
expected_request_body = json.load(f)
# Replace dynamic values in actual request body
time_fields = ["startTime", "endTime", "completionStartTime", "request_id"]
for field in time_fields:
dynamic_fields = [
"startTime",
"endTime",
"completionStartTime",
"request_id",
"id",
"response_time",
]
for field in dynamic_fields:
if field in actual_request_body:
actual_request_body[field] = expected_request_body[field]
@@ -59,6 +67,55 @@
), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"
def assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request_body: dict,
expected_file_name: str,
):
"""
Helper function to compare actual GCS PubSub request body with expected JSON file.
Args:
actual_request_body (dict): The actual request body received from the API call
expected_file_name (str): Name of the JSON file containing expected request body
"""
# Get the current directory and read the expected request body
pwd = os.path.dirname(os.path.realpath(__file__))
expected_body_path = os.path.join(pwd, "gcs_pub_sub_body", expected_file_name)
with open(expected_body_path, "r") as f:
expected_request_body = json.load(f)
# Fields checked for presence in the actual request body; dynamic response values are normalized first
FIELDS_TO_VALIDATE = [
"custom_llm_provider",
"hidden_params",
"messages",
"response",
"model",
"status",
"stream",
]
actual_request_body["response"]["id"] = expected_request_body["response"]["id"]
actual_request_body["response"]["created"] = expected_request_body["response"][
"created"
]
for field in FIELDS_TO_VALIDATE:
assert field in actual_request_body
FIELDS_EXISTENCE_CHECKS = [
"response_cost",
"response_time",
"completion_tokens",
"prompt_tokens",
"total_tokens",
]
for field in FIELDS_EXISTENCE_CHECKS:
assert field in actual_request_body
@pytest.mark.asyncio
async def test_async_gcs_pub_sub():
# Create a mock for the async_httpx_client's post method
@@ -102,6 +159,61 @@ async def test_async_gcs_pub_sub():
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message)
print("##########\n")
print(json.dumps(actual_request, indent=4))
print("##########\n")
# Verify the request body matches expected format
assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request, "standard_logging_payload.json"
)
@pytest.mark.asyncio
async def test_async_gcs_pub_sub_v1():
# Create a mock for the async_httpx_client's post method
litellm.gcs_pub_sub_use_v1 = True
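# v1 mode publishes directly to the Pub/Sub REST endpoint (the publish URL is asserted below)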
mock_post = AsyncMock()
mock_post.return_value.status_code = 202
mock_post.return_value.text = "Accepted"
# Initialize the GcsPubSubLogger and set the mock
gcs_pub_sub_logger = GcsPubSubLogger(flush_interval=1)
gcs_pub_sub_logger.async_httpx_client.post = mock_post
mock_construct_request_headers = AsyncMock()
mock_construct_request_headers.return_value = {"Authorization": "Bearer mock_token"}
gcs_pub_sub_logger.construct_request_headers = mock_construct_request_headers
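# auth header construction is mocked, so the test should not need real GCP credentials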
litellm.callbacks = [gcs_pub_sub_logger]
# Make the completion call
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="hi",
)
await asyncio.sleep(3) # Wait for async flush
# Assert httpx post was called
mock_post.assert_called_once()
# Get the actual request body from the mock
actual_url = mock_post.call_args[1]["url"]
print("sent to url", actual_url)
assert (
actual_url
== "https://pubsub.googleapis.com/v1/projects/reliableKeys/topics/litellmDB:publish"
)
actual_request = mock_post.call_args[1]["json"]
# Extract and decode the base64 encoded message
encoded_message = actual_request["messages"][0]["data"]
import base64
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message)
print("##########\n")

View file

@@ -21,16 +21,18 @@ sys.path.insert(
import litellm
import asyncio
from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger):
def __init__(self):
self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print(
"standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str),