From a42f008cd01683fbba622d4fbaff801f4fc621e2 Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Fri, 13 Dec 2024 08:54:03 -0800
Subject: [PATCH] Litellm dev 12 12 2024 (#7203)

* fix(azure/): support passing headers to azure openai endpoints

Fixes https://github.com/BerriAI/litellm/issues/6217

* fix(utils.py): move default tokenizer to just openai

hf tokenizer makes network calls when trying to get the tokenizer - this slows down execution

* fix(router.py): fix pattern matching router - add generic "*" to it as well

Fixes issue where the generic "*" model access group wouldn't show up

* fix(pattern_match_deployments.py): match to more specific pattern

Matching to the more specific pattern allows setting a generic wildcard model access group and excluding specific models more easily

* fix(proxy_server.py): fix _delete_deployment to handle base case where db_model list is empty

Don't delete all router models because of an empty list

Fixes https://github.com/BerriAI/litellm/issues/7196

* fix(anthropic/): fix handling of response_format for anthropic messages with the anthropic api

* fix(fireworks_ai/): support passing response_format + tool call in same message

Addresses https://github.com/BerriAI/litellm/issues/7135

* Revert "fix(fireworks_ai/): support passing response_format + tool call in same message"

This reverts commit 6a30dc692986a513cfb99c7a10c7cd34d8b93a4f.

* test: fix test

* fix(replicate/): fix replicate default retry/polling logic

* test: add unit testing for router pattern matching

* test: update test to use default oai tokenizer

* test: mark flaky test

* test: skip flaky test
---
 litellm/__init__.py                            |   2 +
 litellm/constants.py                           |   4 +-
 litellm/llms/anthropic/chat/transformation.py  |  78 +++++---
 litellm/llms/azure/azure.py                    |   7 +-
 litellm/llms/replicate/chat/handler.py         |   4 +-
 litellm/main.py                                |   5 +-
 litellm/proxy/_new_secret_config.yaml          |  26 +--
 litellm/proxy/auth/auth_checks.py              |   2 +-
 litellm/proxy/proxy_server.py                  |  39 ++--
 litellm/router.py                              |  14 +-
 .../router_utils/pattern_match_deployments.py  |  45 ++++-
 litellm/utils.py                               |  21 +-
 .../test_anthropic_completion.py               | 186 ++++++++++++++++++
 tests/llm_translation/test_azure_openai.py     |  26 ++-
 tests/local_testing/test_audio_speech.py       |   1 +
 tests/local_testing/test_completion.py         |   1 +
 tests/local_testing/test_config.py             |  56 ++++++
 .../test_router_pattern_matching.py            |  78 +++++++-
 .../test_proxy_token_counter.py                |   4 +-
 19 files changed, 496 insertions(+), 103 deletions(-)

diff --git a/litellm/__init__.py b/litellm/__init__.py
index 6d76e8b3d2..2d2b66795d 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -22,6 +22,8 @@ from litellm.constants import (
     DEFAULT_FLUSH_INTERVAL_SECONDS,
     ROUTER_MAX_FALLBACKS,
     DEFAULT_MAX_RETRIES,
+    DEFAULT_REPLICATE_POLLING_RETRIES,
+    DEFAULT_REPLICATE_POLLING_DELAY_SECONDS,
     LITELLM_CHAT_PROVIDERS,
 )
 from litellm.types.guardrails import GuardrailItem
diff --git a/litellm/constants.py b/litellm/constants.py
index 36c6745123..0c1b4a73d9 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -2,6 +2,8 @@ ROUTER_MAX_FALLBACKS = 5
 DEFAULT_BATCH_SIZE = 512
 DEFAULT_FLUSH_INTERVAL_SECONDS = 5
 DEFAULT_MAX_RETRIES = 2
+DEFAULT_REPLICATE_POLLING_RETRIES = 5
+DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
@@ -67,6 +69,7 @@ LITELLM_CHAT_PROVIDERS = [
     "galadriel",
 ]
 
+RESPONSE_FORMAT_TOOL_NAME = "json_tool_call"  # default tool name used when converting response format to tool call
 ########################### LiteLLM Proxy Specific Constants
########################### MAX_SPENDLOG_ROWS_TO_QUERY = ( @@ -74,4 +77,3 @@ MAX_SPENDLOG_ROWS_TO_QUERY = ( ) # makes it clear this is a rate limit error for a litellm virtual key RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash" - diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index 860ba5eae8..30f87d5456 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -19,9 +19,10 @@ import httpx import requests import litellm +from litellm.constants import RESPONSE_FORMAT_TOOL_NAME from litellm.litellm_core_utils.core_helpers import map_finish_reason -from litellm.llms.base_llm.transformation import BaseConfig, BaseLLMException from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt +from litellm.llms.base_llm.transformation import BaseConfig, BaseLLMException from litellm.types.llms.anthropic import ( AllAnthropicToolsValues, AnthropicComputerTool, @@ -298,6 +299,18 @@ class AnthropicConfig(BaseConfig): new_stop = new_v return new_stop + def _add_tools_to_optional_params( + self, optional_params: dict, tools: List[AllAnthropicToolsValues] + ) -> dict: + if "tools" not in optional_params: + optional_params["tools"] = tools + else: + optional_params["tools"] = [ + *optional_params["tools"], + *tools, + ] + return optional_params + def map_openai_params( self, non_default_params: dict, @@ -311,7 +324,11 @@ class AnthropicConfig(BaseConfig): if param == "max_completion_tokens": optional_params["max_tokens"] = value if param == "tools": - optional_params["tools"] = self._map_tools(value) + # check if optional params already has tools + tool_value = self._map_tools(value) + optional_params = self._add_tools_to_optional_params( + optional_params=optional_params, tools=tool_value + ) if param == "tool_choice" or param == "parallel_tool_calls": _tool_choice: Optional[AnthropicMessagesToolChoice] = ( self._map_tool_choice( @@ -333,6 +350,7 @@ class AnthropicConfig(BaseConfig): if param == "top_p": optional_params["top_p"] = value if param == "response_format" and isinstance(value, dict): + json_schema: Optional[dict] = None if "response_schema" in value: json_schema = value["response_schema"] @@ -344,11 +362,14 @@ class AnthropicConfig(BaseConfig): - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. 
""" - _tool_choice = {"name": "json_tool_call", "type": "tool"} + + _tool_choice = {"name": RESPONSE_FORMAT_TOOL_NAME, "type": "tool"} _tool = self._create_json_tool_call_for_response_format( json_schema=json_schema, ) - optional_params["tools"] = [_tool] + optional_params = self._add_tools_to_optional_params( + optional_params=optional_params, tools=[_tool] + ) optional_params["tool_choice"] = _tool_choice optional_params["json_mode"] = True if param == "user": @@ -381,7 +402,9 @@ class AnthropicConfig(BaseConfig): else: _input_schema["properties"] = {"values": json_schema} - _tool = AnthropicMessagesTool(name="json_tool_call", input_schema=_input_schema) + _tool = AnthropicMessagesTool( + name=RESPONSE_FORMAT_TOOL_NAME, input_schema=_input_schema + ) return _tool def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool: @@ -537,10 +560,6 @@ class AnthropicConfig(BaseConfig): ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in optional_params[k] = v - ## Handle Tool Calling - if "tools" in optional_params: - _is_function_call = True - ## Handle user_id in metadata _litellm_metadata = litellm_params.get("metadata", None) if ( @@ -558,6 +577,26 @@ class AnthropicConfig(BaseConfig): return data + def _transform_response_for_json_mode( + self, + json_mode: Optional[bool], + tool_calls: List[ChatCompletionToolCallChunk], + ) -> Optional[LitellmMessage]: + _message: Optional[LitellmMessage] = None + if json_mode is True and len(tool_calls) == 1: + # check if tool name is the default tool name + json_mode_content_str: Optional[str] = None + if ( + "name" in tool_calls[0]["function"] + and tool_calls[0]["function"]["name"] == RESPONSE_FORMAT_TOOL_NAME + ): + json_mode_content_str = tool_calls[0]["function"].get("arguments") + if json_mode_content_str is not None: + _message = AnthropicConfig._convert_tool_response_to_message( + tool_calls=tool_calls, + ) + return _message + def transform_response( self, model: str, @@ -629,19 +668,14 @@ class AnthropicConfig(BaseConfig): ) ## HANDLE JSON MODE - anthropic returns single function call - if json_mode is True and len(tool_calls) == 1: - json_mode_content_str: Optional[str] = tool_calls[0]["function"].get( - "arguments" - ) - if json_mode_content_str is not None: - _converted_message = ( - AnthropicConfig._convert_tool_response_to_message( - tool_calls=tool_calls, - ) - ) - if _converted_message is not None: - completion_response["stop_reason"] = "stop" - _message = _converted_message + json_mode_message = self._transform_response_for_json_mode( + json_mode=json_mode, + tool_calls=tool_calls, + ) + if json_mode_message is not None: + completion_response["stop_reason"] = "stop" + _message = json_mode_message + model_response.choices[0].message = _message # type: ignore model_response._hidden_params["original_response"] = completion_response[ "content" diff --git a/litellm/llms/azure/azure.py b/litellm/llms/azure/azure.py index 2735884f70..33261a7a7e 100644 --- a/litellm/llms/azure/azure.py +++ b/litellm/llms/azure/azure.py @@ -342,7 +342,8 @@ class AzureChatCompletion(BaseLLM): headers: Optional[dict] = None, client=None, ): - super().completion() + if headers: + optional_params["extra_headers"] = headers try: if model is None or messages is None: raise AzureOpenAIError( @@ -851,8 +852,10 @@ class AzureChatCompletion(BaseLLM): max_retries: Optional[int] = None, client=None, aembedding=None, + headers: Optional[dict] = None, ) -> litellm.EmbeddingResponse: - super().embedding() + if 
headers: + optional_params["extra_headers"] = headers if self._client_session is None: self._client_session = self.create_client_session() try: diff --git a/litellm/llms/replicate/chat/handler.py b/litellm/llms/replicate/chat/handler.py index 4f7a115699..7d1a86fa00 100644 --- a/litellm/llms/replicate/chat/handler.py +++ b/litellm/llms/replicate/chat/handler.py @@ -259,9 +259,9 @@ async def async_completion( ) return CustomStreamWrapper(_response, model, logging_obj=logging_obj, custom_llm_provider="replicate") # type: ignore - for _ in range(litellm.DEFAULT_MAX_RETRIES): + for _ in range(litellm.DEFAULT_REPLICATE_POLLING_RETRIES): await asyncio.sleep( - 1 + litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS ) # wait 1s to allow response to be generated by replicate - else partial output is generated with status=="processing" response = await async_handler.get(url=prediction_url, headers=headers) return litellm.ReplicateConfig().transform_response( diff --git a/litellm/main.py b/litellm/main.py index b0d87e41d8..c012fbc5bd 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -3171,6 +3171,7 @@ def embedding( # noqa: PLR0915 proxy_server_request = kwargs.get("proxy_server_request", None) aembedding = kwargs.get("aembedding", None) extra_headers = kwargs.get("extra_headers", None) + headers = kwargs.get("headers", None) ### CUSTOM MODEL COST ### input_cost_per_token = kwargs.get("input_cost_per_token", None) output_cost_per_token = kwargs.get("output_cost_per_token", None) @@ -3281,9 +3282,6 @@ def embedding( # noqa: PLR0915 "azure_ad_token", None ) or get_secret_str("AZURE_AD_TOKEN") - if extra_headers is not None: - optional_params["extra_headers"] = extra_headers - api_key = ( api_key or litellm.api_key @@ -3311,6 +3309,7 @@ def embedding( # noqa: PLR0915 client=client, aembedding=aembedding, max_retries=max_retries, + headers=headers or extra_headers, ) elif ( model in litellm.open_ai_embedding_models diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 84075f53e0..a66057ae30 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -5,26 +5,8 @@ model_list: api_key: os.environ/AZURE_API_KEY api_base: os.environ/AZURE_API_BASE temperature: 0.2 - -guardrails: - - guardrail_name: "presidio-log-guard" + - model_name: "*" litellm_params: - guardrail: presidio - mode: "logging_only" - mock_redacted_text: - text: "hello world, my name is . My number is: " - items: - - start: 48 - end: 62 - entity_type: PHONE_NUMBER - text: "" - operator: replace - - start: 24 - end: 32 - entity_type: PERSON - text: "" - operator: replace - -litellm_settings: - set_verbose: true - success_callback: ["langfuse"] \ No newline at end of file + model: "*" + model_info: + access_groups: ["default"] \ No newline at end of file diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py index 3b43ec32e1..b74e5199e8 100644 --- a/litellm/proxy/auth/auth_checks.py +++ b/litellm/proxy/auth/auth_checks.py @@ -757,6 +757,7 @@ async def get_key_object( except DB_CONNECTION_ERROR_TYPES as e: return await _handle_failed_db_connection_for_get_key_object(e=e) except Exception: + traceback.print_exc() raise Exception( f"Key doesn't exist in db. key={hashed_token}. Create key via `/key/generate` call." 
) @@ -870,7 +871,6 @@ async def can_key_call_model( access_groups = defaultdict(list) if llm_router: access_groups = llm_router.get_model_access_groups(model_name=model) - if ( len(access_groups) > 0 and llm_router is not None ): # check if token contains any model access groups diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index f002306367..c6801f18fd 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -25,8 +25,6 @@ from typing import ( get_type_hints, ) -import requests - if TYPE_CHECKING: from opentelemetry.trace import Span as _Span @@ -120,7 +118,7 @@ from litellm.litellm_core_utils.core_helpers import ( _get_parent_otel_span_from_kwargs, get_litellm_metadata_from_kwargs, ) -from litellm.llms.custom_httpx.httpx_handler import HTTPHandler +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.proxy._types import * from litellm.proxy.analytics_endpoints.analytics_endpoints import ( router as analytics_router, @@ -528,7 +526,7 @@ async_result = None celery_app_conn = None celery_fn = None # Redis Queue for handling requests ### DB WRITER ### -db_writer_client: Optional[HTTPHandler] = None +db_writer_client: Optional[AsyncHTTPHandler] = None ### logger ### @@ -2092,7 +2090,10 @@ class ProxyConfig: """ global user_config_file_path, llm_router combined_id_list = [] - if llm_router is None: + + ## BASE CASES ## + # if llm_router is None or db_models is empty, return 0 + if llm_router is None or len(db_models) == 0: return 0 ## DB MODELS ## @@ -2422,6 +2423,19 @@ class ProxyConfig: return config + async def _get_models_from_db(self, prisma_client: PrismaClient) -> list: + try: + new_models = await prisma_client.db.litellm_proxymodeltable.find_many() + except Exception as e: + verbose_proxy_logger.exception( + "litellm.proxy_server.py::add_deployment() - Error getting new models from DB - {}".format( + str(e) + ) + ) + new_models = [] + + return new_models + async def add_deployment( self, prisma_client: PrismaClient, @@ -2439,15 +2453,9 @@ class ProxyConfig: raise ValueError( f"Master key is not initialized or formatted. 
master_key={master_key}" ) - try: - new_models = await prisma_client.db.litellm_proxymodeltable.find_many() - except Exception as e: - verbose_proxy_logger.exception( - "litellm.proxy_server.py::add_deployment() - Error getting new models from DB - {}".format( - str(e) - ) - ) - new_models = [] + + new_models = await self._get_models_from_db(prisma_client=prisma_client) + # update llm router await self._update_llm_router( new_models=new_models, proxy_logging_obj=proxy_logging_obj @@ -8066,7 +8074,8 @@ def get_image(): # Check if the logo path is an HTTP/HTTPS URL if logo_path.startswith(("http://", "https://")): # Download the image and cache it - response = requests.get(logo_path) + client = HTTPHandler() + response = client.get(logo_path) if response.status_code == 200: # Save the image to a local file cache_path = os.path.join(current_dir, "cached_logo.jpg") diff --git a/litellm/router.py b/litellm/router.py index d10eee21e0..d54e63e8ab 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -4019,15 +4019,15 @@ class Router: # Check if user is trying to use model_name == "*" # this is a catch all model for their specific api key - if deployment.model_name == "*": - if deployment.litellm_params.model == "*": - # user wants to pass through all requests to litellm.acompletion for unknown deployments - self.router_general_settings.pass_through_all_models = True - else: - self.default_deployment = deployment.to_json(exclude_none=True) + # if deployment.model_name == "*": + # if deployment.litellm_params.model == "*": + # # user wants to pass through all requests to litellm.acompletion for unknown deployments + # self.router_general_settings.pass_through_all_models = True + # else: + # self.default_deployment = deployment.to_json(exclude_none=True) # Check if user is using provider specific wildcard routing # example model_name = "databricks/*" or model_name = "anthropic/*" - elif "*" in deployment.model_name: + if "*" in deployment.model_name: # store this as a regex pattern - all deployments matching this pattern will be sent to this deployment # Store deployment.model_name as a regex pattern self.pattern_router.add_pattern( diff --git a/litellm/router_utils/pattern_match_deployments.py b/litellm/router_utils/pattern_match_deployments.py index a369100eb0..a5e54d898e 100644 --- a/litellm/router_utils/pattern_match_deployments.py +++ b/litellm/router_utils/pattern_match_deployments.py @@ -4,13 +4,52 @@ Class to handle llm wildcard routing and regex pattern matching import copy import re +from functools import cached_property from re import Match -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from litellm import get_llm_provider from litellm._logging import verbose_router_logger +class PatternUtils: + @staticmethod + def calculate_pattern_specificity(pattern: str) -> Tuple[int, int]: + """ + Calculate pattern specificity based on length and complexity. + + Args: + pattern: Regex pattern to analyze + + Returns: + Tuple of (length, complexity) for sorting + """ + complexity_chars = ["*", "+", "?", "\\", "^", "$", "|", "(", ")"] + ret_val = ( + len(pattern), # Longer patterns more specific + sum( + pattern.count(char) for char in complexity_chars + ), # More regex complexity + ) + return ret_val + + @staticmethod + def sorted_patterns( + patterns: Dict[str, List[Dict]] + ) -> List[Tuple[str, List[Dict]]]: + """ + Cached property for patterns sorted by specificity. 
+ + Returns: + Sorted list of pattern-deployment tuples + """ + return sorted( + patterns.items(), + key=lambda x: PatternUtils.calculate_pattern_specificity(x[0]), + reverse=True, + ) + + class PatternMatchRouter: """ Class to handle llm wildcard routing and regex pattern matching @@ -99,13 +138,13 @@ class PatternMatchRouter: if request is None: return None + sorted_patterns = PatternUtils.sorted_patterns(self.patterns) regex_filtered_model_names = ( [self._pattern_to_regex(m) for m in filtered_model_names] if filtered_model_names is not None else [] ) - - for pattern, llm_deployments in self.patterns.items(): + for pattern, llm_deployments in sorted_patterns: if ( filtered_model_names is not None and pattern not in regex_filtered_model_names diff --git a/litellm/utils.py b/litellm/utils.py index e47b5c9d2d..ddaf60d5a1 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1214,7 +1214,9 @@ def client(original_function): # noqa: PLR0915 @lru_cache(maxsize=128) -def _select_tokenizer(model: str): +def _select_tokenizer( + model: str, +): if model in litellm.cohere_models and "command-r" in model: # cohere cohere_tokenizer = Tokenizer.from_pretrained( @@ -1235,19 +1237,10 @@ def _select_tokenizer(model: str): return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} # default - tiktoken else: - tokenizer = None - if ( - model in litellm.open_ai_chat_completion_models - or model in litellm.open_ai_text_completion_models - or model in litellm.open_ai_embedding_models - ): - return {"type": "openai_tokenizer", "tokenizer": encoding} - - try: - tokenizer = Tokenizer.from_pretrained(model) - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} - except Exception: - return {"type": "openai_tokenizer", "tokenizer": encoding} + return { + "type": "openai_tokenizer", + "tokenizer": encoding, + } # default to openai tokenizer def encode(model="", text="", custom_tokenizer: Optional[dict] = None): diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py index 967b2d2722..78bf5081bc 100644 --- a/tests/llm_translation/test_anthropic_completion.py +++ b/tests/llm_translation/test_anthropic_completion.py @@ -685,6 +685,67 @@ class TestAnthropicCompletion(BaseLLMChatTest): """ pass + def test_tool_call_and_json_response_format(self): + """ + Test that the tool call and JSON response format is supported by the LLM API + """ + litellm.set_verbose = True + from pydantic import BaseModel + from litellm.utils import supports_response_schema + + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + class RFormat(BaseModel): + question: str + answer: str + + base_completion_call_args = self.get_base_completion_call_args() + if not supports_response_schema(base_completion_call_args["model"], None): + pytest.skip("Model does not support response schema") + + try: + res = litellm.completion( + **base_completion_call_args, + messages=[ + { + "role": "system", + "content": "response user question with JSON object", + }, + {"role": "user", "content": "Hey! What's the weather in NewYork?"}, + ], + tool_choice="required", + response_format=RFormat, + tools=[ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, + }, + } + ], + ) + assert res is not None + + assert res.choices[0].message.tool_calls is not None + except litellm.InternalServerError: + pytest.skip("Model is overloaded") + def test_convert_tool_response_to_message_with_values(): """Test converting a tool response with 'values' key to a message""" @@ -829,3 +890,128 @@ def test_anthropic_tool_with_image(): ) assert b64_data in json.dumps(result) + + +def test_anthropic_map_openai_params_tools_and_json_schema(): + import json + + args = { + "non_default_params": { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "properties": { + "question": {"title": "Question", "type": "string"}, + "answer": {"title": "Answer", "type": "string"}, + }, + "required": ["question", "answer"], + "title": "RFormat", + "type": "object", + "additionalProperties": False, + }, + "name": "RFormat", + "strict": True, + }, + }, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, + }, + } + ], + "tool_choice": "required", + } + } + + mapped_params = litellm.AnthropicConfig().map_openai_params( + non_default_params=args["non_default_params"], + optional_params={}, + model="claude-3-5-sonnet-20240620", + drop_params=False, + ) + + assert "Question" in json.dumps(mapped_params) + + +from litellm.constants import RESPONSE_FORMAT_TOOL_NAME + + +@pytest.mark.parametrize( + "json_mode, tool_calls, expect_null_response", + [ + ( + True, + [ + { + "id": "toolu_013JszbnYBVygTxh6EGHEHia", + "type": "function", + "function": { + "name": "get_current_weather", + "arguments": '{"location": "New York, NY"}', + }, + "index": 0, + } + ], + True, + ), + ( + True, + [ + { + "id": "toolu_013JszbnYBVygTxh6EGHEHia", + "type": "function", + "function": { + "name": RESPONSE_FORMAT_TOOL_NAME, + "arguments": '{"location": "New York, NY"}', + }, + "index": 0, + } + ], + False, + ), + ( + False, + [ + { + "id": "toolu_013JszbnYBVygTxh6EGHEHia", + "type": "function", + "function": { + "name": RESPONSE_FORMAT_TOOL_NAME, + "arguments": '{"location": "New York, NY"}', + }, + "index": 0, + } + ], + True, + ), + ], +) +def test_anthropic_json_mode_and_tool_call_response( + json_mode, tool_calls, expect_null_response +): + result = litellm.AnthropicConfig()._transform_response_for_json_mode( + json_mode=json_mode, + tool_calls=tool_calls, + ) + + assert ( + result is None if expect_null_response else result is not None + ), f"Expected result to be {None if expect_null_response else 'not None'}, but got {result}" diff --git a/tests/llm_translation/test_azure_openai.py b/tests/llm_translation/test_azure_openai.py index 431fd4347a..7a5d35d282 100644 --- a/tests/llm_translation/test_azure_openai.py +++ b/tests/llm_translation/test_azure_openai.py @@ -113,7 +113,14 @@ import os ({"prompt": "Hello world"}, "image_generation"), ], ) -def test_azure_extra_headers(input, call_type): +@pytest.mark.parametrize( + "header_value", + [ + "headers", + "extra_headers", + ], +) +def test_azure_extra_headers(input, call_type, header_value): from litellm import embedding, 
image_generation http_client = Client() @@ -128,18 +135,21 @@ def test_azure_extra_headers(input, call_type): func = embedding elif call_type == "image_generation": func = image_generation - response = func( - model="azure/chatgpt-v-2", - api_base="https://openai-gpt-4-test-v-1.openai.azure.com", - api_version="2023-07-01-preview", - api_key="my-azure-api-key", - extra_headers={ + + data = { + "model": "azure/chatgpt-v-2", + "api_base": "https://openai-gpt-4-test-v-1.openai.azure.com", + "api_version": "2023-07-01-preview", + "api_key": "my-azure-api-key", + header_value: { "Authorization": "my-bad-key", "Ocp-Apim-Subscription-Key": "hello-world-testing", }, **input, - ) + } + response = func(**data) print(response) + except Exception as e: print(e) diff --git a/tests/local_testing/test_audio_speech.py b/tests/local_testing/test_audio_speech.py index cb7cad6f2d..a003d786d0 100644 --- a/tests/local_testing/test_audio_speech.py +++ b/tests/local_testing/test_audio_speech.py @@ -116,6 +116,7 @@ async def test_audio_speech_litellm_vertex(sync_mode): response.stream_to_file(speech_file_path) +@pytest.mark.flaky(retries=6, delay=2) @pytest.mark.asyncio async def test_speech_litellm_vertex_async(): # Mock the response diff --git a/tests/local_testing/test_completion.py b/tests/local_testing/test_completion.py index 1c8b04575c..a73a227db6 100644 --- a/tests/local_testing/test_completion.py +++ b/tests/local_testing/test_completion.py @@ -3094,6 +3094,7 @@ def test_completion_azure_deployment_id(): import asyncio +@pytest.mark.skip(reason="replicate endpoints are extremely flaky") @pytest.mark.parametrize("sync_mode", [False, True]) @pytest.mark.asyncio async def test_completion_replicate_llama3(sync_mode): diff --git a/tests/local_testing/test_config.py b/tests/local_testing/test_config.py index a63816e8e2..213f5095ea 100644 --- a/tests/local_testing/test_config.py +++ b/tests/local_testing/test_config.py @@ -175,6 +175,62 @@ async def test_add_existing_deployment(): assert init_len_list == len(llm_router.model_list) +@pytest.mark.asyncio +async def test_db_error_new_model_check(): + """ + - if error in db, don't delete existing models + + Relevant issue: https://github.com/BerriAI/litellm/blob/ddfe687b13e9f31db2fb2322887804e3d01dd467/litellm/proxy/proxy_server.py#L2461 + """ + import base64 + + litellm_params = LiteLLM_Params( + model="gpt-3.5-turbo", + api_key=os.getenv("AZURE_API_KEY"), + api_base=os.getenv("AZURE_API_BASE"), + api_version=os.getenv("AZURE_API_VERSION"), + ) + deployment = Deployment(model_name="gpt-3.5-turbo", litellm_params=litellm_params) + deployment_2 = Deployment( + model_name="gpt-3.5-turbo-2", litellm_params=litellm_params + ) + + llm_router = litellm.Router( + model_list=[ + deployment.to_json(exclude_none=True), + deployment_2.to_json(exclude_none=True), + ] + ) + + init_len_list = len(llm_router.model_list) + print(f"llm_router: {llm_router}") + master_key = "sk-1234" + setattr(litellm.proxy.proxy_server, "llm_router", llm_router) + setattr(litellm.proxy.proxy_server, "master_key", master_key) + pc = ProxyConfig() + + encrypted_litellm_params = litellm_params.dict(exclude_none=True) + + for k, v in encrypted_litellm_params.items(): + if isinstance(v, str): + encrypted_value = encrypt_value(v, master_key) + encrypted_litellm_params[k] = base64.b64encode(encrypted_value).decode( + "utf-8" + ) + db_model = DBModel( + model_id=deployment.model_info.id, + model_name="gpt-3.5-turbo", + litellm_params=encrypted_litellm_params, + model_info={"id": deployment.model_info.id}, 
+ ) + + db_models = [] + deleted_deployments = await pc._delete_deployment(db_models=db_models) + assert deleted_deployments == 0 + + assert init_len_list == len(llm_router.model_list) + + litellm_params = LiteLLM_Params( model="azure/chatgpt-v-2", api_key=os.getenv("AZURE_API_KEY"), diff --git a/tests/local_testing/test_router_pattern_matching.py b/tests/local_testing/test_router_pattern_matching.py index 914e8ecfa9..a7ef7df2bc 100644 --- a/tests/local_testing/test_router_pattern_matching.py +++ b/tests/local_testing/test_router_pattern_matching.py @@ -133,7 +133,7 @@ def test_route_with_multiple_matching_patterns(): router.add_pattern("openai/*", deployment1.to_json(exclude_none=True)) router.add_pattern("openai/gpt-*", deployment2.to_json(exclude_none=True)) assert router.route("openai/gpt-3.5-turbo") == [ - deployment1.to_json(exclude_none=True) + deployment2.to_json(exclude_none=True) ] @@ -237,3 +237,79 @@ def test_router_pattern_match_e2e(): "model": "gpt-4o", "messages": [{"role": "user", "content": "Hello, how are you?"}], } + + +def test_pattern_matching_router_with_default_wildcard(): + """ + Tests that the router returns the default wildcard model when the pattern is not found + + Make sure generic '*' allows all models to be passed through. + """ + router = Router( + model_list=[ + { + "model_name": "*", + "litellm_params": {"model": "*"}, + "model_info": {"access_groups": ["default"]}, + }, + { + "model_name": "anthropic-claude", + "litellm_params": {"model": "anthropic/claude-3-5-sonnet"}, + }, + ] + ) + + assert len(router.pattern_router.patterns) > 0 + + router.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello, how are you?"}], + ) + + +def test_pattern_matching_router_with_default_wildcard_and_model_wildcard(): + """ + Match to more specific pattern first. 
+ """ + router = Router( + model_list=[ + { + "model_name": "*", + "litellm_params": {"model": "*"}, + "model_info": {"access_groups": ["default"]}, + }, + { + "model_name": "llmengine/*", + "litellm_params": {"model": "openai/*"}, + }, + ] + ) + + assert len(router.pattern_router.patterns) > 0 + + pattern_router = router.pattern_router + deployments = pattern_router.route("llmengine/gpt-3.5-turbo") + assert len(deployments) == 1 + assert deployments[0]["model_name"] == "llmengine/*" + + +def test_sorted_patterns(): + """ + Tests that the pattern specificity is calculated correctly + """ + from litellm.router_utils.pattern_match_deployments import PatternUtils + + sorted_patterns = PatternUtils.sorted_patterns( + { + "llmengine/*": [{"model_name": "anthropic/claude-3-5-sonnet"}], + "*": [{"model_name": "openai/*"}], + }, + ) + assert sorted_patterns[0][0] == "llmengine/*" + + +def test_calculate_pattern_specificity(): + from litellm.router_utils.pattern_match_deployments import PatternUtils + + assert PatternUtils.calculate_pattern_specificity("llmengine/*") == (11, 1) + assert PatternUtils.calculate_pattern_specificity("*") == (1, 1) diff --git a/tests/proxy_unit_tests/test_proxy_token_counter.py b/tests/proxy_unit_tests/test_proxy_token_counter.py index 859ddf5c74..11dededd6c 100644 --- a/tests/proxy_unit_tests/test_proxy_token_counter.py +++ b/tests/proxy_unit_tests/test_proxy_token_counter.py @@ -63,8 +63,8 @@ async def test_vLLM_token_counting(): print("response: ", response) assert ( - response.tokenizer_type == "huggingface_tokenizer" - ) # SHOULD use the hugging face tokenizer + response.tokenizer_type == "openai_tokenizer" + ) # SHOULD use the default tokenizer assert response.model_used == "wolfram/miquliz-120b-v2.0"