From bb2c57bbdd6f54d1c41235c76f17c0b0a97cd70f Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Sat, 1 Mar 2025 10:45:40 -0800
Subject: [PATCH] Litellm stable release notes v1 61 20 (#8929)

* docs(index.md): add initial release notes

* docs(infinity.md): update docs with supported cohere rerank params

* style: cleanup

* docs(vllm.md): add doc on sending video to vllm

* docs(index.md): add vllm video logic to release notes

* docs(reasoning_content.md): cleanup docs
---
 docs/my-website/docs/providers/bedrock.md  |  17 +-
 docs/my-website/docs/providers/infinity.md | 117 ++++++++
 docs/my-website/docs/providers/vertex.md   | 114 ++++++++
 docs/my-website/docs/providers/vllm.md     |  92 +++++++
 docs/my-website/docs/reasoning_content.md  | 252 +++++++++++++++++-
 .../release_notes/v1.57.8-stable/index.md  |   7 -
 .../release_notes/v1.61.20-stable/index.md |  97 +++++++
 7 files changed, 684 insertions(+), 12 deletions(-)
 create mode 100644 docs/my-website/release_notes/v1.61.20-stable/index.md

diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md
index 816c260770..d765da1e51 100644
--- a/docs/my-website/docs/providers/bedrock.md
+++ b/docs/my-website/docs/providers/bedrock.md
@@ -379,11 +379,20 @@ print(f"\nResponse: {resp}")
 
 ## Usage - 'thinking' / 'reasoning content'
 
-This is currently only supported for Anthropic's Claude 3.7 Sonnet.
+This is currently only supported for Anthropic's Claude 3.7 Sonnet + Deepseek R1.
 
-Works for:
-- sync completion calls (SDK) - v1.61.19+
-- async completion calls (SDK + PROXY) - v1.61.20+
+Works on v1.61.20+.
+
+Returns two new fields in the `message` and `delta` objects:
+- `reasoning_content` - string - The reasoning content of the response
+- `thinking_blocks` - list of objects (Anthropic only) - The thinking blocks of the response
+
+Each object has the following fields:
+- `type` - Literal["thinking"] - The type of thinking block
+- `thinking` - string - The model's thinking text. Also returned in `reasoning_content`
+- `signature_delta` - string - A base64-encoded string returned by Anthropic.
+
+Anthropic requires the `signature_delta` on subsequent calls whenever 'thinking' content is passed back in (it is only needed when using `thinking` with tool calling).
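+Here's a minimal SDK sketch of reading these fields back. The Bedrock model ID below is an assumption - substitute the Claude 3.7 Sonnet identifier available in your region:
+
+```python
+from litellm import completion
+
+resp = completion(
+    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",  # assumed model ID
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+
+print(resp.choices[0].message.reasoning_content)  # reasoning text (all providers)
+print(resp.choices[0].message.thinking_blocks)    # thinking blocks (Anthropic only)
+```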
[Learn more](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks)

diff --git a/docs/my-website/docs/providers/infinity.md b/docs/my-website/docs/providers/infinity.md
index dd6986dfef..091503bf18 100644
--- a/docs/my-website/docs/providers/infinity.md
+++ b/docs/my-website/docs/providers/infinity.md
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Infinity
 
 | Property | Details |
@@ -12,6 +15,9 @@
 
 ```python
 from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
 
 response = rerank(
     model="infinity/rerank",
@@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
 ```
 
+## Supported Cohere Rerank API Params
+
+| Param | Type | Description |
+|-------|-------|-------|
+| `query` | `str` | The query to rerank the documents against |
+| `documents` | `list[str]` | The documents to rerank |
+| `top_n` | `int` | The number of documents to return |
+| `return_documents` | `bool` | Whether to return the documents in the response |
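+For example, a minimal SDK call using `top_n` (assumes `INFINITY_API_BASE` is set as shown above):
+
+```python
+response = rerank(
+    model="infinity/rerank",
+    query="What is the capital of France?",
+    documents=["Paris", "London", "Berlin", "Madrid"],
+    top_n=2,  # only return the 2 highest-scoring documents
+)
+```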
+### Usage - Return Documents
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+response = rerank(
+    model="infinity/rerank",
+    query="What is the capital of France?",
+    documents=["Paris", "London", "Berlin", "Madrid"],
+    return_documents=True,
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/rerank \
+  -H "Authorization: Bearer sk-1234" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "custom-infinity-rerank",
+    "query": "What is the capital of France?",
+    "documents": [
+      "Paris",
+      "London",
+      "Berlin",
+      "Madrid"
+    ],
+    "return_documents": true
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+## Pass Provider-specific Params
+
+Any unmapped params will be passed to the provider as-is.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
+
+response = rerank(
+    model="infinity/rerank",
+    query="What is the capital of France?",
+    documents=["Paris", "London", "Berlin", "Madrid"],
+    raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: custom-infinity-rerank
+    litellm_params:
+      model: infinity/rerank
+      api_base: http://localhost:8080
+      raw_scores: True # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
+```
+
+2. Start litellm
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it!
+
+```bash
+curl http://0.0.0.0:4000/rerank \
+  -H "Authorization: Bearer sk-1234" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "custom-infinity-rerank",
+    "query": "What is the capital of the United States?",
+    "documents": [
+      "Carson City is the capital city of the American state of Nevada.",
+      "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
+      "Washington, D.C. is the capital of the United States.",
+      "Capital punishment has existed in the United States since before it was a country."
+    ],
+    "raw_scores": true # 👈 PROVIDER-SPECIFIC PARAM
+  }'
+```
+
+</TabItem>
+</Tabs>
+
diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index cb8c031c06..f2ed4650ac 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -852,6 +852,7 @@ litellm.vertex_location = "us-central1 # Your Location
 | claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
 | claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
 | claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
+| claude-3-7-sonnet@20250219 | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |
 
 ### Usage
 
@@ -926,6 +927,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 
 
+
+### Usage - `thinking` / `reasoning_content`
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+resp = completion(
+    model="vertex_ai/claude-3-7-sonnet-20250219",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+- model_name: claude-3-7-sonnet-20250219
+  litellm_params:
+    model: vertex_ai/claude-3-7-sonnet-20250219
+    vertex_ai_project: "my-test-project"
+    vertex_ai_location: "us-east5" # a region where Claude 3.7 Sonnet is available on Vertex AI
+```
+
+2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "claude-3-7-sonnet-20250219",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+**Expected Response**
+
+```python
+ModelResponse(
+    id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
+    created=1740470510,
+    model='claude-3-7-sonnet-20250219',
+    object='chat.completion',
+    system_fingerprint=None,
+    choices=[
+        Choices(
+            finish_reason='stop',
+            index=0,
+            message=Message(
+                content="The capital of France is Paris.",
+                role='assistant',
+                tool_calls=None,
+                function_call=None,
+                provider_specific_fields={
+                    'citations': None,
+                    'thinking_blocks': [
+                        {
+                            'type': 'thinking',
+                            'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
+                            'signature': 'EuYBCkQYAiJAy6...'
+                        }
+                    ]
+                }
+            ),
+            thinking_blocks=[
+                {
+                    'type': 'thinking',
+                    'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
+                    'signature': 'EuYBCkQYAiJAy6AGB...'
+                }
+            ],
+            reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
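+            # NOTE: the 'signature' above (truncated here) must be sent back unchanged
+            # on subsequent requests when this 'thinking' content is passed in with tool calls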
+        )
+    ],
+    usage=Usage(
+        completion_tokens=68,
+        prompt_tokens=42,
+        total_tokens=110,
+        completion_tokens_details=None,
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None,
+            cached_tokens=0,
+            text_tokens=None,
+            image_tokens=None
+        ),
+        cache_creation_input_tokens=0,
+        cache_read_input_tokens=0
+    )
+)
+```
+
 ## Llama 3 API
 
 | Model Name | Function Call |
diff --git a/docs/my-website/docs/providers/vllm.md b/docs/my-website/docs/providers/vllm.md
index 9cc0ad487e..b5987167ec 100644
--- a/docs/my-website/docs/providers/vllm.md
+++ b/docs/my-website/docs/providers/vllm.md
@@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
 
+## Send Video URL to VLLM
+
+Example implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
+
+There are two ways to send a video URL to VLLM:
+
+1. Pass the video URL directly
+
+```
+{"type": "video_url", "video_url": {"url": video_url}},
+```
+
+2. Pass the video data as base64 (see the encoding sketch after the examples below)
+
+```
+{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+response = completion(
+    model="hosted_vllm/qwen",  # pass the vllm model name
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Summarize the following video"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+                    }
+                }
+            ]
+        }
+    ],
+    api_base="https://hosted-vllm-api.co")
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: my-model
+    litellm_params:
+      model: hosted_vllm/qwen           # add hosted_vllm/ prefix to route as OpenAI provider
+      api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
+```
+
+2. Start the proxy
+
+```bash
+$ litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it!
+
+```bash
+curl -X POST http://0.0.0.0:4000/chat/completions \
+-H "Authorization: Bearer sk-1234" \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "my-model",
+    "messages": [
+        {"role": "user", "content":
+            [
+                {"type": "text", "text": "Summarize the following video"},
+                {"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
+            ]
+        }
+    ]
+}'
+```
+
+</TabItem>
+</Tabs>
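+As referenced in option 2 above, here's a short sketch for building the base64 data URI - the file path and variable names are illustrative:
+
+```python
+import base64
+
+# read a local mp4 and build the data URI accepted by the `video_url` content part
+with open("my_video.mp4", "rb") as f:
+    video_data_base64 = base64.b64encode(f.read()).decode("utf-8")
+
+video_content_part = {
+    "type": "video_url",
+    "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"},
+}
+```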
 
 
 ## (Deprecated) for `vllm pip package`
 ### Using - `litellm.completion`
 
diff --git a/docs/my-website/docs/reasoning_content.md b/docs/my-website/docs/reasoning_content.md
index ce049e0127..92c3fae61f 100644
--- a/docs/my-website/docs/reasoning_content.md
+++ b/docs/my-website/docs/reasoning_content.md
@@ -6,7 +6,7 @@ import TabItem from '@theme/TabItem';
 Supported Providers:
 - Deepseek (`deepseek/`)
 - Anthropic API (`anthropic/`)
-- Bedrock (Anthropic) (`bedrock/`)
+- Bedrock (Anthropic + Deepseek) (`bedrock/`)
 - Vertex AI (Anthropic) (`vertexai/`)
 
 ```python
@@ -95,8 +95,258 @@ curl http://0.0.0.0:4000/v1/chat/completions \
 }
 ```
 
+## Tool Calling with `thinking`
+
+Here's how to use Anthropic `thinking` blocks with tool calling.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+import json
+
+import litellm
+
+litellm._turn_on_debug()
+litellm.modify_params = True
+model = "anthropic/claude-3-7-sonnet-20250219"  # works across Anthropic, Bedrock, Vertex AI
+
+def get_current_weather(location, unit="fahrenheit"):
+    # example tool implementation - returns mock weather data
+    return json.dumps({"location": location, "temperature": "72", "unit": unit})
+
+# Step 1: send the conversation and available functions to the model
+messages = [
+    {
+        "role": "user",
+        "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
+    }
+]
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+response = litellm.completion(
+    model=model,
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",  # auto is default, but we'll be explicit
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+print("Response\n", response)
+response_message = response.choices[0].message
+tool_calls = response_message.tool_calls
+
+print("Expecting there to be 3 tool calls")
+assert (
+    len(tool_calls) > 0
+)  # this has to call the function for SF, Tokyo and Paris
+
+# Step 2: check if the model wanted to call a function
+print(f"tool_calls: {tool_calls}")
+if tool_calls:
+    # Step 3: call the function
+    # Note: the JSON response may not always be valid; be sure to handle errors
+    available_functions = {
+        "get_current_weather": get_current_weather,
+    }  # only one function in this example, but you can have multiple
+    messages.append(
+        response_message
+    )  # extend conversation with assistant's reply
+    print("Response message\n", response_message)
+    # Step 4: send the info for each function call and function response to the model
+    for tool_call in tool_calls:
+        function_name = tool_call.function.name
+        if function_name not in available_functions:
+            # the model called a function that does not exist in available_functions - skip it
+            continue
+        function_to_call = available_functions[function_name]
+        function_args = json.loads(tool_call.function.arguments)
+        function_response = function_to_call(
+            location=function_args.get("location"),
+            unit=function_args.get("unit"),
+        )
+        messages.append(
+            {
+                "tool_call_id": tool_call.id,
+                "role": "tool",
+                "name": function_name,
+                "content": function_response,
+            }
+        )  # extend conversation with function response
+    print(f"messages: {messages}")
+    second_response = litellm.completion(
+        model=model,
+        messages=messages,
+        seed=22,
+        # tools=tools,
+        drop_params=True,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+    )  # get a new response from the model where it can see the function response
+    print("second response\n", second_response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: claude-3-7-sonnet-thinking
+    litellm_params:
+      model: anthropic/claude-3-7-sonnet-20250219
+      api_key: os.environ/ANTHROPIC_API_KEY
+      thinking: {
+        "type": "enabled",
+        "budget_tokens": 1024
+      }
+```
+
+2. Run proxy
+
+```bash
+litellm --config config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Make 1st call
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "claude-3-7-sonnet-thinking",
+    "messages": [
+      {"role": "user", "content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"}
+    ],
+    "tools": [
+      {
+        "type": "function",
+        "function": {
+          "name": "get_current_weather",
+          "description": "Get the current weather in a given location",
+          "parameters": {
+            "type": "object",
+            "properties": {
+              "location": {
+                "type": "string",
+                "description": "The city and state"
+              },
+              "unit": {
+                "type": "string",
+                "enum": ["celsius", "fahrenheit"]
+              }
+            },
+            "required": ["location"]
+          }
+        }
+      }
+    ],
+    "tool_choice": "auto"
+  }'
+```
+
+4. Make 2nd call with tool call results
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "claude-3-7-sonnet-thinking",
+    "messages": [
+      {
+        "role": "user",
+        "content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
+      },
+      {
+        "role": "assistant",
+        "content": "I\'ll check the current weather for these three cities for you:",
+        "tool_calls": [
+          {
+            "index": 2,
+            "function": {
+              "arguments": "{\"location\": \"San Francisco\"}",
+              "name": "get_current_weather"
+            },
+            "id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
+            "type": "function"
+          }
+        ],
+        "function_call": null,
+        "reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
+        "thinking_blocks": [
+          {
+            "type": "thinking",
+            "thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
+            "signature_delta": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
+          }
+        ],
+        "provider_specific_fields": {
+          "reasoningContentBlocks": [
+            {
+              "reasoningText": {
+                "signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
+                "text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user."
+              }
+            }
+          ]
+        }
+      },
+      {
+        "tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
+        "role": "tool",
+        "name": "get_current_weather",
+        "content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
+      }
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+## Switching between Anthropic + Deepseek models
+
+Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
+
+```python
+litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
+
+# or per request
+## Anthropic
+response = litellm.completion(
+    model="anthropic/claude-3-7-sonnet-20250219",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+    drop_params=True,
+)
+
+## Deepseek
+response = litellm.completion(
+    model="deepseek/deepseek-chat",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+    drop_params=True,
+)
+```
+
 ## Spec
+
 These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
 
 - `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
diff --git a/docs/my-website/release_notes/v1.57.8-stable/index.md b/docs/my-website/release_notes/v1.57.8-stable/index.md
index 9787444fde..d37a7b9ff8 100644
--- a/docs/my-website/release_notes/v1.57.8-stable/index.md
+++ b/docs/my-website/release_notes/v1.57.8-stable/index.md
@@ -18,13 +18,6 @@ hide_table_of_contents: false
 
 `alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch`
 
-:::note
-
-v1.57.8-stable, is currently being tested. It will be released on 2025-01-12.
-
-:::
-
-
 ## New / Updated Models
 
 1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452
diff --git a/docs/my-website/release_notes/v1.61.20-stable/index.md b/docs/my-website/release_notes/v1.61.20-stable/index.md
new file mode 100644
index 0000000000..57e4fa9484
--- /dev/null
+++ b/docs/my-website/release_notes/v1.61.20-stable/index.md
@@ -0,0 +1,97 @@
+---
+title: v1.61.20-stable
+slug: v1.61.20-stable
+date: 2025-03-02T10:00:00
+authors:
+  - name: Krrish Dholakia
+    title: CEO, LiteLLM
+    url: https://www.linkedin.com/in/krish-d/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
+  - name: Ishaan Jaffer
+    title: CTO, LiteLLM
+    url: https://www.linkedin.com/in/reffajnaahsi/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
+tags: [llm translation, rerank, ui, thinking, reasoning_content, claude-3-7-sonnet]
+hide_table_of_contents: false
+---
+
+import Image from '@theme/IdealImage';
+
+# v1.61.20-stable
+
+
+:::info
+
+`v1.61.20-stable` will be live on 2025-03-04.
+
+:::
+
+These are the changes since `v1.61.13-stable`.
+
+This release is primarily focused on:
+- LLM Translation improvements (claude-3-7-sonnet + 'thinking'/'reasoning_content' support)
+- UI improvements (add model flow, user management, etc.)
+
+## New Models / Updated Models
+
+1. Anthropic 3-7 sonnet support + cost tracking (Anthropic API + Bedrock + Vertex AI + OpenRouter)
+    1. Anthropic API [Start here](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content)
+    2. Bedrock API [Start here](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
+    3. Vertex AI API [See here](../../docs/providers/vertex#usage---thinking--reasoning_content)
+    4. OpenRouter [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L5626)
+2. GPT-4.5-preview support + cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L79)
+3. Azure AI - Phi-4 cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L1773)
+4. Claude-3.5-sonnet - vision support updated on Anthropic API [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2888)
+5. Bedrock llama vision support [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L7714)
+6. Cerebras llama3.3-70b pricing [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2697)
+
+## LLM Translation
+
+1. Infinity Rerank - support returning documents when return_documents=True [Start here](../../docs/providers/infinity#usage---returning-documents)
+2. Amazon Deepseek - `<think>` param extraction into 'reasoning_content' [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-imported-models-deepseek-deepseek-r1)
+3. Amazon Titan Embeddings - filter out 'aws_' params from request body [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-embedding)
+4. Anthropic 'thinking' + 'reasoning_content' translation support (Anthropic API, Bedrock, Vertex AI) [Start here](https://docs.litellm.ai/docs/reasoning_content)
+5. VLLM - support 'video_url' [Start here](../../docs/providers/vllm#send-video-url-to-vllm)
+6. Call proxy via litellm SDK: Support `litellm_proxy/` for embedding, image_generation, transcription, speech, rerank [Start here](https://docs.litellm.ai/docs/providers/litellm_proxy)
+7. OpenAI Pass-through - allow using Assistants GET, DELETE on /openai pass-through routes [Start here](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
+8. Message Translation - fix openai message for assistant msg if role is missing - openai allows this
+9. O1/O3 - support `drop_params` for the `parallel_tool_calls` param on o3-mini and o1 (not currently supported by these models) [See here](https://docs.litellm.ai/docs/completion/drop_params)
+
+## Spend Tracking Improvements
+
+1. Cost tracking for rerank via Bedrock [See PR](https://github.com/BerriAI/litellm/commit/b682dc4ec8fd07acf2f4c981d2721e36ae2a49c5)
+2. Anthropic pass-through - fix race condition causing cost to not be tracked [See PR](https://github.com/BerriAI/litellm/pull/8874)
+3. Anthropic pass-through - Ensure accurate token counting [See PR](https://github.com/BerriAI/litellm/pull/8880)
+
+## Management Endpoints / UI
+
+1. Models Page - Allow sorting models by ‘created at’
+2. Models Page - Edit Model Flow Improvements
+3. Models Page - Fix Adding Azure, Azure AI Studio models on UI
+4. Internal Users Page - Allow Bulk Adding Internal Users on UI
+5. Internal Users Page - Allow sorting users by ‘created at’
+6. Virtual Keys Page - Allow searching for UserIDs on the dropdown when assigning a user to a team [See PR](https://github.com/BerriAI/litellm/pull/8844)
+7. Virtual Keys Page - allow creating a user when assigning keys to users [See PR](https://github.com/BerriAI/litellm/pull/8844)
+8. Model Hub Page - fix text overflow issue [See PR](https://github.com/BerriAI/litellm/pull/8749)
+9. Admin Settings Page - Allow adding MSFT SSO on UI
+10. Backend - prevent creating duplicate internal users in the DB
+
+## Helm
+
+1. support ttlSecondsAfterFinished on the migration job - [See PR](https://github.com/BerriAI/litellm/pull/8593)
+2. enhance migrations job with additional configurable properties - [See PR](https://github.com/BerriAI/litellm/pull/8636)
+
+## Logging / Guardrail Integrations
+
+1. Arize Phoenix support
+2. ‘No-log’ - fix ‘no-log’ param support on embedding calls
+
+## Performance / Loadbalancing / Reliability improvements
+
+1. Single Deployment Cooldown logic - Use allowed_fails or allowed_fail_policy if set (example config below) [Start here](https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type)
+
+## General Proxy Improvements
+
+1. Hypercorn - fix reading / parsing request body
+2. Windows - fix running the proxy on Windows
+3. DD-Trace - fix dd-trace enablement on proxy
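+A minimal sketch of the cooldown config referenced above - `allowed_fails` and `cooldown_time` are router settings; check the linked routing docs for the full `allowed_fails_policy` schema:
+
+```yaml
+router_settings:
+  allowed_fails: 3    # cooldown a deployment after 3 failures
+  cooldown_time: 30   # keep it cooled down for 30 seconds
+```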