diff --git a/.gitignore b/.gitignore
index e8e8aed2b1..6e5f15dbe8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,3 +66,4 @@ litellm/tests/langfuse.log
 litellm/tests/langfuse.log
 litellm/proxy/google-cloud-sdk/*
 tests/llm_translation/log.txt
+venv/
diff --git a/litellm/llms/azure/azure.py b/litellm/llms/azure/azure.py
index c2be42648c..b38c7abbcb 100644
--- a/litellm/llms/azure/azure.py
+++ b/litellm/llms/azure/azure.py
@@ -1386,7 +1386,7 @@ class AzureChatCompletion(BaseLLM):
             input=input,
             **optional_params,
         )
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1415,14 +1415,14 @@
             client_type="async",
         )  # type: ignore

-        response = await azure_client.audio.speech.create(
+        azure_response = await azure_client.audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )

-        return response
+        return HttpxBinaryResponseContent(response=azure_response.response)

     def get_headers(
         self,
diff --git a/litellm/llms/azure/files/handler.py b/litellm/llms/azure/files/handler.py
index b299145ad4..fd1ef0d535 100644
--- a/litellm/llms/azure/files/handler.py
+++ b/litellm/llms/azure/files/handler.py
@@ -1,4 +1,4 @@
-from typing import Any, Coroutine, Optional, Union
+from typing import Any, Coroutine, Optional, Union, cast

 import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -111,7 +111,7 @@ class AzureOpenAIFilesAPI(BaseLLM):
         openai_client: AsyncAzureOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -152,9 +152,11 @@
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(AzureOpenAI, openai_client).files.content(
+            **file_content_request
+        )

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,
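Note on the recurring pattern above: each handler used to return the `openai` SDK's legacy `HttpxBinaryResponseContent` directly; it now re-wraps the underlying `httpx.Response` (exposed on the SDK object's `.response` attribute) in litellm's own subclass, defined further down in `litellm/types/llms/openai.py`, so litellm can attach `_hidden_params` to what it returns. A minimal sketch of the pattern, assuming only that the `openai` package is installed — the `rewrap` helper name is hypothetical, not part of the patch:

```python
from openai._legacy_response import (
    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
)


class HttpxBinaryResponseContent(_HttpxBinaryResponseContent):
    """litellm's subclass: same binary-content API, plus room for metadata."""

    _hidden_params: dict = {}


def rewrap(sdk_response: _HttpxBinaryResponseContent) -> HttpxBinaryResponseContent:
    # The SDK object keeps the raw httpx.Response on `.response`; re-wrapping
    # copies no bytes, it only changes the Python type so isinstance checks
    # against the litellm class succeed.
    return HttpxBinaryResponseContent(response=sdk_response.response)
```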
diff --git a/litellm/llms/openai/openai.py b/litellm/llms/openai/openai.py
index 90b642a567..e73c1d55ec 100644
--- a/litellm/llms/openai/openai.py
+++ b/litellm/llms/openai/openai.py
@@ -1253,13 +1253,13 @@ class OpenAIChatCompletion(BaseLLM):
             client=client,
         )

-        response = openai_client.audio.speech.create(
+        response = cast(OpenAI, openai_client).audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )
-        return response  # type: ignore
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1276,13 +1276,16 @@
         client=None,
     ) -> HttpxBinaryResponseContent:

-        openai_client = self._get_openai_client(
-            is_async=True,
-            api_key=api_key,
-            api_base=api_base,
-            timeout=timeout,
-            max_retries=max_retries,
-            client=client,
+        openai_client = cast(
+            AsyncOpenAI,
+            self._get_openai_client(
+                is_async=True,
+                api_key=api_key,
+                api_base=api_base,
+                timeout=timeout,
+                max_retries=max_retries,
+                client=client,
+            ),
         )

         response = await openai_client.audio.speech.create(
@@ -1292,7 +1295,7 @@
             **optional_params,
         )

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def ahealth_check(
         self,
@@ -1477,7 +1480,7 @@ class OpenAIFilesAPI(BaseLLM):
         openai_client: AsyncOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -1515,9 +1518,9 @@
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(OpenAI, openai_client).files.content(**file_content_request)

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index 2a22c10456..3aeccebf60 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,
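For context on the new pricing entry: both directions are priced at $0.00000072 per token, i.e. $0.72 per million tokens. A back-of-the-envelope sketch of what cost tracking should report for a call through this model — the constants are copied from the JSON above, and the token counts are made up for illustration:

```python
# Prices copied from the "meta.llama3-3-70b-instruct-v1:0" entry above.
INPUT_COST_PER_TOKEN = 0.00000072
OUTPUT_COST_PER_TOKEN = 0.00000072


def estimate_cost(input_tokens: int, output_tokens: int) -> float:
    """Linear per-token pricing, as encoded in the context-window JSON."""
    return input_tokens * INPUT_COST_PER_TOKEN + output_tokens * OUTPUT_COST_PER_TOKEN


# A hypothetical 1,000-token prompt with a 200-token completion:
# 1000 * 0.00000072 + 200 * 0.00000072 = 0.000864 USD
print(f"${estimate_cost(1000, 200):.6f}")  # -> $0.000864
```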
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index a46e8d1eaf..a556dd1e2c 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -12,12 +12,6 @@ model_list:
       model: openai/gpt-4o
       api_key: os.environ/OPENAI_API_KEY
       num_retries: 3
-  - model_name: azure-tts
-    litellm_params:
-      model: azure/tts-1
-      api_key: "6a4ULQtTSBZi5TONpzGP2GkKDUSSReHFQYwjMpZaywNxY03FsplGJQQJ99ALACHrzpqXJ3w3AAAAACOGUsjy"
-      api_base: "https://krris-m4rve6fd-northcentralus.cognitiveservices.azure.com/openai/deployments/tts"
-      api_version: "2024-05-01-preview"

 litellm_settings:
-  success_callback: ["langfuse"]
\ No newline at end of file
+  success_callback: ["langfuse"]
diff --git a/litellm/proxy/spend_tracking/spend_tracking_utils.py b/litellm/proxy/spend_tracking/spend_tracking_utils.py
index 355a476d47..3240bfb8a3 100644
--- a/litellm/proxy/spend_tracking/spend_tracking_utils.py
+++ b/litellm/proxy/spend_tracking/spend_tracking_utils.py
@@ -1,7 +1,7 @@
 import json
 import secrets
 from datetime import datetime as dt
-from typing import Optional
+from typing import Optional, cast

 from pydantic import BaseModel

@@ -40,7 +40,9 @@ def get_logging_payload(
     if kwargs is None:
         kwargs = {}

-    if response_obj is None:
+    if response_obj is None or (
+        not isinstance(response_obj, BaseModel) and not isinstance(response_obj, dict)
+    ):
         response_obj = {}
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
     litellm_params = kwargs.get("litellm_params", {})
@@ -50,10 +52,10 @@ def get_logging_payload(
     completion_start_time = kwargs.get("completion_start_time", end_time)
     call_type = kwargs.get("call_type")
     cache_hit = kwargs.get("cache_hit", False)
-    usage = response_obj.get("usage", None) or {}
+    usage = cast(dict, response_obj).get("usage", None) or {}
     if isinstance(usage, litellm.Usage):
         usage = dict(usage)
-    id = response_obj.get("id") or kwargs.get("litellm_call_id")
+    id = cast(dict, response_obj).get("id") or kwargs.get("litellm_call_id")
     api_key = metadata.get("user_api_key", "")
     if api_key is not None and isinstance(api_key, str):
         if api_key.startswith("sk-"):
diff --git a/litellm/rerank_api/main.py b/litellm/rerank_api/main.py
index 57a353adbe..315109280c 100644
--- a/litellm/rerank_api/main.py
+++ b/litellm/rerank_api/main.py
@@ -12,7 +12,7 @@ from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
 from litellm.llms.jina_ai.rerank.handler import JinaAIRerank
 from litellm.llms.together_ai.rerank.handler import TogetherAIRerank
 from litellm.rerank_api.rerank_utils import get_optional_rerank_params
-from litellm.secret_managers.main import get_secret
+from litellm.secret_managers.main import get_secret, get_secret_str
 from litellm.types.rerank import OptionalRerankParams, RerankResponse
 from litellm.types.router import *
 from litellm.utils import ProviderConfigManager, client, exception_type
@@ -211,7 +211,7 @@ def rerank(  # noqa: PLR0915
             dynamic_api_base
             or optional_params.api_base
             or litellm.api_base
-            or get_secret("INFINITY_API_BASE")  # type: ignore
+            or get_secret_str("INFINITY_API_BASE")
         )

         if api_base is None:
diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py
index 26c0eab3a0..e544cf24d2 100644
--- a/litellm/types/llms/openai.py
+++ b/litellm/types/llms/openai.py
@@ -1,7 +1,9 @@
 from os import PathLike
 from typing import IO, Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union

-from openai._legacy_response import HttpxBinaryResponseContent
+from openai._legacy_response import (
+    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
+)
 from openai.lib.streaming._assistants import (
     AssistantEventHandler,
     AssistantStreamManager,
@@ -48,6 +50,11 @@ FileTypes = Union[
 EmbeddingInput = Union[str, List[str]]


+class HttpxBinaryResponseContent(_HttpxBinaryResponseContent):
+    _hidden_params: dict = {}
+    pass
+
+
 class NotGiven:
     """
     A sentinel singleton class used to distinguish omitted keyword arguments
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 2a22c10456..3aeccebf60 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,
diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py
index bb7b38addc..d7a76e8542 100644
--- a/tests/local_testing/test_amazing_vertex_completion.py
+++ b/tests/local_testing/test_amazing_vertex_completion.py
@@ -210,7 +210,7 @@ async def test_get_router_response():
 #     reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
 # )
 # @pytest.mark.flaky(retries=3, delay=1)
-def test_vertex_ai_anthropic():
+def test_aavertex_ai_anthropic():
     model = "claude-3-sonnet@20240229"

     vertex_ai_project = "adroit-crow-413218"
diff --git a/tests/local_testing/test_audio_speech.py b/tests/local_testing/test_audio_speech.py
index 9e60add17a..da8872a785 100644
--- a/tests/local_testing/test_audio_speech.py
+++ b/tests/local_testing/test_audio_speech.py
@@ -61,7 +61,7 @@ async def test_audio_speech_litellm(sync_mode, model, api_base, api_key):
             optional_params={},
         )

-        from litellm.llms.openai.openai import HttpxBinaryResponseContent
+        from litellm.types.llms.openai import HttpxBinaryResponseContent

         assert isinstance(response, HttpxBinaryResponseContent)
     else:
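End to end, the change is visible from the public API: after this patch, `litellm.speech()` returns litellm's `HttpxBinaryResponseContent` rather than the SDK's legacy type, which is exactly what the updated test asserts against the new import path. A minimal usage sketch, assuming a valid `OPENAI_API_KEY` in the environment and that `tts-1` is available on the account:

```python
import litellm
from litellm.types.llms.openai import HttpxBinaryResponseContent

# speech() wraps the provider's audio.speech.create() call; with this patch
# the return value is litellm's own response type for both sync and async.
response = litellm.speech(
    model="openai/tts-1",
    voice="alloy",
    input="Hello from litellm",
)
assert isinstance(response, HttpxBinaryResponseContent)

# The binary-content helpers from the openai SDK are inherited unchanged.
response.stream_to_file("speech.mp3")
```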