Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 11:14:04 +00:00)
(Refactor) /v1/messages to follow simpler logic for Anthropic API spec (#9013)
* anthropic_messages_handler v0
* fix /messages
* working messages with router methods
* test_anthropic_messages_handler_litellm_router_non_streaming
* test_anthropic_messages_litellm_router_non_streaming_with_logging
* AnthropicMessagesConfig
* _handle_anthropic_messages_response_logging
* working with /v1/messages endpoint
* working /v1/messages endpoint
* refactor to use router factory function
* use aanthropic_messages
* use BaseConfig for Anthropic /v1/messages
* track api key, team on /v1/messages endpoint
* fix get_logging_payload
* BaseAnthropicMessagesTest
* align test config
* test_anthropic_messages_with_thinking
* test_anthropic_streaming_with_thinking
* fix - display anthropic url for debugging
* test_bad_request_error_handling
* test_anthropic_messages_router_streaming_with_bad_request
* fix ProxyException
* test_bad_request_error_handling_streaming
* use provider_specific_header
* test_anthropic_messages_with_extra_headers
* test_anthropic_messages_to_wildcard_model
* fix gcs pub sub test
* standard_logging_payload
* fix unit testing for anthopic /v1/messages support
* fix pass through anthropic messages api
* delete dead code
* fix anthropic pass through response
* revert change to spend tracking utils
* fix get_litellm_metadata_from_kwargs
* fix spend logs payload json
* proxy_pass_through_endpoint_tests
* TestAnthropicPassthroughBasic
* fix pass through tests
* test_async_vertex_proxy_route_api_key_auth
* _handle_anthropic_messages_response_logging
* vertex_credentials
* test_set_default_vertex_config
* test_anthropic_messages_litellm_router_non_streaming_with_logging
* test_ageneric_api_call_with_fallbacks_basic
* test__aadapter_completion
This commit is contained in:
parent
31c5ea74ab
commit
f47987e673
25 changed files with 1581 additions and 1027 deletions
@@ -1935,12 +1935,12 @@ jobs:
             pip install prisma
             pip install fastapi
             pip install jsonschema
-            pip install "httpx==0.24.1"
+            pip install "httpx==0.27.0"
             pip install "anyio==3.7.1"
             pip install "asyncio==3.4.3"
             pip install "PyGithub==1.59.1"
             pip install "google-cloud-aiplatform==1.59.0"
-            pip install "anthropic==0.21.3"
+            pip install "anthropic==0.49.0"
       # Run pytest and generate JUnit XML report
       - run:
           name: Build Docker image

@@ -800,9 +800,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
 from .llms.maritalk import MaritalkConfig
 from .llms.openrouter.chat.transformation import OpenrouterConfig
 from .llms.anthropic.chat.transformation import AnthropicConfig
-from .llms.anthropic.experimental_pass_through.transformation import (
-    AnthropicExperimentalPassThroughConfig,
-)
 from .llms.groq.stt.transformation import GroqSTTConfig
 from .llms.anthropic.completion.transformation import AnthropicTextConfig
 from .llms.triton.completion.transformation import TritonConfig

@@ -821,6 +818,9 @@ from .llms.infinity.rerank.transformation import InfinityRerankConfig
 from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
 from .llms.clarifai.chat.transformation import ClarifaiConfig
 from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
+from .llms.anthropic.experimental_pass_through.messages.transformation import (
+    AnthropicMessagesConfig,
+)
 from .llms.together_ai.chat import TogetherAIConfig
 from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
 from .llms.cloudflare.chat.transformation import CloudflareChatConfig

@@ -1011,6 +1011,7 @@ from .assistants.main import *
 from .batches.main import *
 from .batch_completion.main import *  # type: ignore
 from .rerank_api.main import *
+from .llms.anthropic.experimental_pass_through.messages.handler import *
 from .realtime_api.main import _arealtime
 from .fine_tuning.main import *
 from .files.main import *

@@ -1,186 +0,0 @@ (entire file removed)

# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import traceback
from typing import Any, Optional

import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse


class AnthropicAdapter(CustomLogger):
    def __init__(self) -> None:
        super().__init__()

    def translate_completion_input_params(
        self, kwargs
    ) -> Optional[ChatCompletionRequest]:
        """
        - translate params, where needed
        - pass rest, as is
        """
        request_body = AnthropicMessagesRequest(**kwargs)  # type: ignore

        translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
            anthropic_message_request=request_body
        )

        return translated_body

    def translate_completion_output_params(
        self, response: ModelResponse
    ) -> Optional[AnthropicResponse]:
        return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
            response=response
        )

    def translate_completion_output_params_streaming(
        self, completion_stream: Any
    ) -> AdapterCompletionStreamWrapper | None:
        return AnthropicStreamWrapper(completion_stream=completion_stream)


anthropic_adapter = AnthropicAdapter()


class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
    """
    - first chunk return 'message_start'
    - content block must be started and stopped
    - finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
    """

    sent_first_chunk: bool = False
    sent_content_block_start: bool = False
    sent_content_block_finish: bool = False
    sent_last_message: bool = False
    holding_chunk: Optional[Any] = None

    def __next__(self):
        try:
            if self.sent_first_chunk is False:
                self.sent_first_chunk = True
                return {
                    "type": "message_start",
                    "message": {
                        "id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
                        "type": "message",
                        "role": "assistant",
                        "content": [],
                        "model": "claude-3-5-sonnet-20240620",
                        "stop_reason": None,
                        "stop_sequence": None,
                        "usage": {"input_tokens": 25, "output_tokens": 1},
                    },
                }
            if self.sent_content_block_start is False:
                self.sent_content_block_start = True
                return {
                    "type": "content_block_start",
                    "index": 0,
                    "content_block": {"type": "text", "text": ""},
                }

            for chunk in self.completion_stream:
                if chunk == "None" or chunk is None:
                    raise Exception

                processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
                    response=chunk
                )
                if (
                    processed_chunk["type"] == "message_delta"
                    and self.sent_content_block_finish is False
                ):
                    self.holding_chunk = processed_chunk
                    self.sent_content_block_finish = True
                    return {
                        "type": "content_block_stop",
                        "index": 0,
                    }
                elif self.holding_chunk is not None:
                    return_chunk = self.holding_chunk
                    self.holding_chunk = processed_chunk
                    return return_chunk
                else:
                    return processed_chunk
            if self.holding_chunk is not None:
                return_chunk = self.holding_chunk
                self.holding_chunk = None
                return return_chunk
            if self.sent_last_message is False:
                self.sent_last_message = True
                return {"type": "message_stop"}
            raise StopIteration
        except StopIteration:
            if self.sent_last_message is False:
                self.sent_last_message = True
                return {"type": "message_stop"}
            raise StopIteration
        except Exception as e:
            verbose_logger.error(
                "Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
            )

    async def __anext__(self):
        try:
            if self.sent_first_chunk is False:
                self.sent_first_chunk = True
                return {
                    "type": "message_start",
                    "message": {
                        "id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
                        "type": "message",
                        "role": "assistant",
                        "content": [],
                        "model": "claude-3-5-sonnet-20240620",
                        "stop_reason": None,
                        "stop_sequence": None,
                        "usage": {"input_tokens": 25, "output_tokens": 1},
                    },
                }
            if self.sent_content_block_start is False:
                self.sent_content_block_start = True
                return {
                    "type": "content_block_start",
                    "index": 0,
                    "content_block": {"type": "text", "text": ""},
                }
            async for chunk in self.completion_stream:
                if chunk == "None" or chunk is None:
                    raise Exception
                processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
                    response=chunk
                )
                if (
                    processed_chunk["type"] == "message_delta"
                    and self.sent_content_block_finish is False
                ):
                    self.holding_chunk = processed_chunk
                    self.sent_content_block_finish = True
                    return {
                        "type": "content_block_stop",
                        "index": 0,
                    }
                elif self.holding_chunk is not None:
                    return_chunk = self.holding_chunk
                    self.holding_chunk = processed_chunk
                    return return_chunk
                else:
                    return processed_chunk
            if self.holding_chunk is not None:
                return_chunk = self.holding_chunk
                self.holding_chunk = None
                return return_chunk
            if self.sent_last_message is False:
                self.sent_last_message = True
                return {"type": "message_stop"}
            raise StopIteration
        except StopIteration:
            if self.sent_last_message is False:
                self.sent_last_message = True
                return {"type": "message_stop"}
            raise StopAsyncIteration

@@ -73,6 +73,8 @@ def remove_index_from_tool_calls(
 def get_litellm_metadata_from_kwargs(kwargs: dict):
     """
     Helper to get litellm metadata from all litellm request kwargs
+
+    Return `litellm_metadata` if it exists, otherwise return `metadata`
     """
     litellm_params = kwargs.get("litellm_params", {})
     if litellm_params:

@@ -932,6 +932,9 @@ class Logging(LiteLLMLoggingBaseClass):
         self.model_call_details["log_event_type"] = "successful_api_call"
         self.model_call_details["end_time"] = end_time
         self.model_call_details["cache_hit"] = cache_hit
+
+        if self.call_type == CallTypes.anthropic_messages.value:
+            result = self._handle_anthropic_messages_response_logging(result=result)
         ## if model in model cost map - log the response cost
         ## else set cost to None
         if (

@@ -2304,6 +2307,37 @@ class Logging(LiteLLMLoggingBaseClass):
             return complete_streaming_response
         return None

+    def _handle_anthropic_messages_response_logging(self, result: Any) -> ModelResponse:
+        """
+        Handles logging for Anthropic messages responses.
+
+        Args:
+            result: The response object from the model call
+
+        Returns:
+            The response object from the model call
+
+        - For Non-streaming responses, we need to transform the response to a ModelResponse object.
+        - For streaming responses, anthropic_messages handler calls success_handler with an assembled ModelResponse.
+        """
+        if self.stream and isinstance(result, ModelResponse):
+            return result
+
+        result = litellm.AnthropicConfig().transform_response(
+            raw_response=self.model_call_details["httpx_response"],
+            model_response=litellm.ModelResponse(),
+            model=self.model,
+            messages=[],
+            logging_obj=self,
+            optional_params={},
+            api_key="",
+            request_data={},
+            encoding=litellm.encoding,
+            json_mode=False,
+            litellm_params={},
+        )
+        return result
+
+
 def set_callbacks(callback_list, function_id=None):  # noqa: PLR0915
     """

@@ -0,0 +1,179 @@ (new file)

"""
- call /messages on Anthropic API
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Ensure requests are logged in the DB - stream + non-stream

"""

import json
from typing import Any, AsyncIterator, Dict, Optional, Union, cast

import httpx

import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.anthropic_messages.transformation import (
    BaseAnthropicMessagesConfig,
)
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    get_async_httpx_client,
)
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import ProviderSpecificHeader
from litellm.utils import ProviderConfigManager, client


class AnthropicMessagesHandler:

    @staticmethod
    async def _handle_anthropic_streaming(
        response: httpx.Response,
        request_body: dict,
        litellm_logging_obj: LiteLLMLoggingObj,
    ) -> AsyncIterator:
        """Helper function to handle Anthropic streaming responses using the existing logging handlers"""
        from datetime import datetime

        from litellm.proxy.pass_through_endpoints.streaming_handler import (
            PassThroughStreamingHandler,
        )
        from litellm.proxy.pass_through_endpoints.success_handler import (
            PassThroughEndpointLogging,
        )
        from litellm.proxy.pass_through_endpoints.types import EndpointType

        # Create success handler object
        passthrough_success_handler_obj = PassThroughEndpointLogging()

        # Use the existing streaming handler for Anthropic
        start_time = datetime.now()
        return PassThroughStreamingHandler.chunk_processor(
            response=response,
            request_body=request_body,
            litellm_logging_obj=litellm_logging_obj,
            endpoint_type=EndpointType.ANTHROPIC,
            start_time=start_time,
            passthrough_success_handler_obj=passthrough_success_handler_obj,
            url_route="/v1/messages",
        )


@client
async def anthropic_messages(
    api_key: str,
    model: str,
    stream: bool = False,
    api_base: Optional[str] = None,
    client: Optional[AsyncHTTPHandler] = None,
    custom_llm_provider: Optional[str] = None,
    **kwargs,
) -> Union[Dict[str, Any], AsyncIterator]:
    """
    Makes Anthropic `/v1/messages` API calls In the Anthropic API Spec
    """
    # Use provided client or create a new one
    optional_params = GenericLiteLLMParams(**kwargs)
    model, _custom_llm_provider, dynamic_api_key, dynamic_api_base = (
        litellm.get_llm_provider(
            model=model,
            custom_llm_provider=custom_llm_provider,
            api_base=optional_params.api_base,
            api_key=optional_params.api_key,
        )
    )
    anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = (
        ProviderConfigManager.get_provider_anthropic_messages_config(
            model=model,
            provider=litellm.LlmProviders(_custom_llm_provider),
        )
    )
    if anthropic_messages_provider_config is None:
        raise ValueError(
            f"Anthropic messages provider config not found for model: {model}"
        )
    if client is None or not isinstance(client, AsyncHTTPHandler):
        async_httpx_client = get_async_httpx_client(
            llm_provider=litellm.LlmProviders.ANTHROPIC
        )
    else:
        async_httpx_client = client

    litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)

    # Prepare headers
    provider_specific_header = cast(
        Optional[ProviderSpecificHeader], kwargs.get("provider_specific_header", None)
    )
    extra_headers = (
        provider_specific_header.get("extra_headers", {})
        if provider_specific_header
        else {}
    )
    headers = anthropic_messages_provider_config.validate_environment(
        headers=extra_headers or {},
        model=model,
        api_key=api_key,
    )

    litellm_logging_obj.update_environment_variables(
        model=model,
        optional_params=dict(optional_params),
        litellm_params={
            "metadata": kwargs.get("metadata", {}),
            "preset_cache_key": None,
            "stream_response": {},
            **optional_params.model_dump(exclude_unset=True),
        },
        custom_llm_provider=_custom_llm_provider,
    )
    litellm_logging_obj.model_call_details.update(kwargs)

    # Prepare request body
    request_body = kwargs.copy()
    request_body = {
        k: v
        for k, v in request_body.items()
        if k
        in anthropic_messages_provider_config.get_supported_anthropic_messages_params(
            model=model
        )
    }
    request_body["stream"] = stream
    request_body["model"] = model
    litellm_logging_obj.stream = stream

    # Make the request
    request_url = anthropic_messages_provider_config.get_complete_url(
        api_base=api_base, model=model
    )

    litellm_logging_obj.pre_call(
        input=[{"role": "user", "content": json.dumps(request_body)}],
        api_key="",
        additional_args={
            "complete_input_dict": request_body,
            "api_base": str(request_url),
            "headers": headers,
        },
    )

    response = await async_httpx_client.post(
        url=request_url,
        headers=headers,
        data=json.dumps(request_body),
        stream=stream,
    )
    response.raise_for_status()

    # used for logging + cost tracking
    litellm_logging_obj.model_call_details["httpx_response"] = response

    if stream:
        return await AnthropicMessagesHandler._handle_anthropic_streaming(
            response=response,
            request_body=request_body,
            litellm_logging_obj=litellm_logging_obj,
        )
    else:
        return response.json()

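For orientation, a minimal sketch of how this new handler could be exercised directly (assumptions: an ANTHROPIC_API_KEY environment variable is set and the model name below is available on the account; the router normally reaches this function via `aanthropic_messages`):

import asyncio
import os

import litellm


async def main():
    # kwargs are filtered against get_supported_anthropic_messages_params(),
    # so the payload below follows the raw Anthropic /v1/messages shape
    response = await litellm.anthropic_messages(
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="anthropic/claude-3-5-sonnet-20241022",
        messages=[{"role": "user", "content": "Hello!"}],
        max_tokens=256,
        stream=False,
    )
    print(response)  # plain dict in the Anthropic response format


asyncio.run(main())
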
@@ -0,0 +1,47 @@ (new file)

from typing import Optional

from litellm.llms.base_llm.anthropic_messages.transformation import (
    BaseAnthropicMessagesConfig,
)

DEFAULT_ANTHROPIC_API_BASE = "https://api.anthropic.com"
DEFAULT_ANTHROPIC_API_VERSION = "2023-06-01"


class AnthropicMessagesConfig(BaseAnthropicMessagesConfig):
    def get_supported_anthropic_messages_params(self, model: str) -> list:
        return [
            "messages",
            "model",
            "system",
            "max_tokens",
            "stop_sequences",
            "temperature",
            "top_p",
            "top_k",
            "tools",
            "tool_choice",
            "thinking",
            # TODO: Add Anthropic `metadata` support
            # "metadata",
        ]

    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        api_base = api_base or DEFAULT_ANTHROPIC_API_BASE
        if not api_base.endswith("/v1/messages"):
            api_base = f"{api_base}/v1/messages"
        return api_base

    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
    ) -> dict:
        if "x-api-key" not in headers:
            headers["x-api-key"] = api_key
        if "anthropic-version" not in headers:
            headers["anthropic-version"] = DEFAULT_ANTHROPIC_API_VERSION
        if "content-type" not in headers:
            headers["content-type"] = "application/json"
        return headers

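A quick illustration of what this provider config resolves to at request time (a sketch; the printed values follow directly from the defaults above, and the API key is a placeholder):

from litellm.llms.anthropic.experimental_pass_through.messages.transformation import (
    AnthropicMessagesConfig,
)

config = AnthropicMessagesConfig()

# api_base=None falls back to the public Anthropic endpoint and /v1/messages is appended
print(config.get_complete_url(api_base=None, model="claude-3-5-sonnet-20241022"))
# -> https://api.anthropic.com/v1/messages

# missing x-api-key / anthropic-version / content-type headers are filled in
print(
    config.validate_environment(
        headers={}, model="claude-3-5-sonnet-20241022", api_key="sk-ant-placeholder"
    )
)
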
@@ -1,412 +0,0 @@ (entire file removed)

import json
from typing import List, Literal, Optional, Tuple, Union

from openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice

from litellm.types.llms.anthropic import (
    AllAnthropicToolsValues,
    AnthopicMessagesAssistantMessageParam,
    AnthropicFinishReason,
    AnthropicMessagesRequest,
    AnthropicMessagesToolChoice,
    AnthropicMessagesUserMessageParam,
    AnthropicResponse,
    AnthropicResponseContentBlockText,
    AnthropicResponseContentBlockToolUse,
    AnthropicResponseUsageBlock,
    ContentBlockDelta,
    ContentJsonBlockDelta,
    ContentTextBlockDelta,
    MessageBlockDelta,
    MessageDelta,
    UsageDelta,
)
from litellm.types.llms.openai import (
    AllMessageValues,
    ChatCompletionAssistantMessage,
    ChatCompletionAssistantToolCall,
    ChatCompletionImageObject,
    ChatCompletionImageUrlObject,
    ChatCompletionRequest,
    ChatCompletionSystemMessage,
    ChatCompletionTextObject,
    ChatCompletionToolCallFunctionChunk,
    ChatCompletionToolChoiceFunctionParam,
    ChatCompletionToolChoiceObjectParam,
    ChatCompletionToolChoiceValues,
    ChatCompletionToolMessage,
    ChatCompletionToolParam,
    ChatCompletionToolParamFunctionChunk,
    ChatCompletionUserMessage,
)
from litellm.types.utils import Choices, ModelResponse, Usage


class AnthropicExperimentalPassThroughConfig:
    def __init__(self):
        pass

    ### FOR [BETA] `/v1/messages` endpoint support

    def translatable_anthropic_params(self) -> List:
        """
        Which anthropic params, we need to translate to the openai format.
        """
        return ["messages", "metadata", "system", "tool_choice", "tools"]

    def translate_anthropic_messages_to_openai(  # noqa: PLR0915
        self,
        messages: List[
            Union[
                AnthropicMessagesUserMessageParam,
                AnthopicMessagesAssistantMessageParam,
            ]
        ],
    ) -> List:
        new_messages: List[AllMessageValues] = []
        for m in messages:
            user_message: Optional[ChatCompletionUserMessage] = None
            tool_message_list: List[ChatCompletionToolMessage] = []
            new_user_content_list: List[
                Union[ChatCompletionTextObject, ChatCompletionImageObject]
            ] = []
            ## USER MESSAGE ##
            if m["role"] == "user":
                ## translate user message
                message_content = m.get("content")
                if message_content and isinstance(message_content, str):
                    user_message = ChatCompletionUserMessage(
                        role="user", content=message_content
                    )
                elif message_content and isinstance(message_content, list):
                    for content in message_content:
                        if content["type"] == "text":
                            text_obj = ChatCompletionTextObject(
                                type="text", text=content["text"]
                            )
                            new_user_content_list.append(text_obj)
                        elif content["type"] == "image":
                            image_url = ChatCompletionImageUrlObject(
                                url=f"data:{content['type']};base64,{content['source']}"
                            )
                            image_obj = ChatCompletionImageObject(
                                type="image_url", image_url=image_url
                            )

                            new_user_content_list.append(image_obj)
                        elif content["type"] == "tool_result":
                            if "content" not in content:
                                tool_result = ChatCompletionToolMessage(
                                    role="tool",
                                    tool_call_id=content["tool_use_id"],
                                    content="",
                                )
                                tool_message_list.append(tool_result)
                            elif isinstance(content["content"], str):
                                tool_result = ChatCompletionToolMessage(
                                    role="tool",
                                    tool_call_id=content["tool_use_id"],
                                    content=content["content"],
                                )
                                tool_message_list.append(tool_result)
                            elif isinstance(content["content"], list):
                                for c in content["content"]:
                                    if c["type"] == "text":
                                        tool_result = ChatCompletionToolMessage(
                                            role="tool",
                                            tool_call_id=content["tool_use_id"],
                                            content=c["text"],
                                        )
                                        tool_message_list.append(tool_result)
                                    elif c["type"] == "image":
                                        image_str = (
                                            f"data:{c['type']};base64,{c['source']}"
                                        )
                                        tool_result = ChatCompletionToolMessage(
                                            role="tool",
                                            tool_call_id=content["tool_use_id"],
                                            content=image_str,
                                        )
                                        tool_message_list.append(tool_result)

            if user_message is not None:
                new_messages.append(user_message)

            if len(new_user_content_list) > 0:
                new_messages.append({"role": "user", "content": new_user_content_list})  # type: ignore

            if len(tool_message_list) > 0:
                new_messages.extend(tool_message_list)

            ## ASSISTANT MESSAGE ##
            assistant_message_str: Optional[str] = None
            tool_calls: List[ChatCompletionAssistantToolCall] = []
            if m["role"] == "assistant":
                if isinstance(m["content"], str):
                    assistant_message_str = m["content"]
                elif isinstance(m["content"], list):
                    for content in m["content"]:
                        if content["type"] == "text":
                            if assistant_message_str is None:
                                assistant_message_str = content["text"]
                            else:
                                assistant_message_str += content["text"]
                        elif content["type"] == "tool_use":
                            function_chunk = ChatCompletionToolCallFunctionChunk(
                                name=content["name"],
                                arguments=json.dumps(content["input"]),
                            )

                            tool_calls.append(
                                ChatCompletionAssistantToolCall(
                                    id=content["id"],
                                    type="function",
                                    function=function_chunk,
                                )
                            )

            if assistant_message_str is not None or len(tool_calls) > 0:
                assistant_message = ChatCompletionAssistantMessage(
                    role="assistant",
                    content=assistant_message_str,
                )
                if len(tool_calls) > 0:
                    assistant_message["tool_calls"] = tool_calls
                new_messages.append(assistant_message)

        return new_messages

    def translate_anthropic_tool_choice_to_openai(
        self, tool_choice: AnthropicMessagesToolChoice
    ) -> ChatCompletionToolChoiceValues:
        if tool_choice["type"] == "any":
            return "required"
        elif tool_choice["type"] == "auto":
            return "auto"
        elif tool_choice["type"] == "tool":
            tc_function_param = ChatCompletionToolChoiceFunctionParam(
                name=tool_choice.get("name", "")
            )
            return ChatCompletionToolChoiceObjectParam(
                type="function", function=tc_function_param
            )
        else:
            raise ValueError(
                "Incompatible tool choice param submitted - {}".format(tool_choice)
            )

    def translate_anthropic_tools_to_openai(
        self, tools: List[AllAnthropicToolsValues]
    ) -> List[ChatCompletionToolParam]:
        new_tools: List[ChatCompletionToolParam] = []
        mapped_tool_params = ["name", "input_schema", "description"]
        for tool in tools:
            function_chunk = ChatCompletionToolParamFunctionChunk(
                name=tool["name"],
            )
            if "input_schema" in tool:
                function_chunk["parameters"] = tool["input_schema"]  # type: ignore
            if "description" in tool:
                function_chunk["description"] = tool["description"]  # type: ignore

            for k, v in tool.items():
                if k not in mapped_tool_params:  # pass additional computer kwargs
                    function_chunk.setdefault("parameters", {}).update({k: v})
            new_tools.append(
                ChatCompletionToolParam(type="function", function=function_chunk)
            )

        return new_tools

    def translate_anthropic_to_openai(
        self, anthropic_message_request: AnthropicMessagesRequest
    ) -> ChatCompletionRequest:
        """
        This is used by the beta Anthropic Adapter, for translating anthropic `/v1/messages` requests to the openai format.
        """
        new_messages: List[AllMessageValues] = []

        ## CONVERT ANTHROPIC MESSAGES TO OPENAI
        new_messages = self.translate_anthropic_messages_to_openai(
            messages=anthropic_message_request["messages"]
        )
        ## ADD SYSTEM MESSAGE TO MESSAGES
        if "system" in anthropic_message_request:
            new_messages.insert(
                0,
                ChatCompletionSystemMessage(
                    role="system", content=anthropic_message_request["system"]
                ),
            )

        new_kwargs: ChatCompletionRequest = {
            "model": anthropic_message_request["model"],
            "messages": new_messages,
        }
        ## CONVERT METADATA (user_id)
        if "metadata" in anthropic_message_request:
            if "user_id" in anthropic_message_request["metadata"]:
                new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]

        # Pass litellm proxy specific metadata
        if "litellm_metadata" in anthropic_message_request:
            # metadata will be passed to litellm.acompletion(), it's a litellm_param
            new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata")

        ## CONVERT TOOL CHOICE
        if "tool_choice" in anthropic_message_request:
            new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
                tool_choice=anthropic_message_request["tool_choice"]
            )
        ## CONVERT TOOLS
        if "tools" in anthropic_message_request:
            new_kwargs["tools"] = self.translate_anthropic_tools_to_openai(
                tools=anthropic_message_request["tools"]
            )

        translatable_params = self.translatable_anthropic_params()
        for k, v in anthropic_message_request.items():
            if k not in translatable_params:  # pass remaining params as is
                new_kwargs[k] = v  # type: ignore

        return new_kwargs

    def _translate_openai_content_to_anthropic(
        self, choices: List[Choices]
    ) -> List[
        Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
    ]:
        new_content: List[
            Union[
                AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse
            ]
        ] = []
        for choice in choices:
            if (
                choice.message.tool_calls is not None
                and len(choice.message.tool_calls) > 0
            ):
                for tool_call in choice.message.tool_calls:
                    new_content.append(
                        AnthropicResponseContentBlockToolUse(
                            type="tool_use",
                            id=tool_call.id,
                            name=tool_call.function.name or "",
                            input=json.loads(tool_call.function.arguments),
                        )
                    )
            elif choice.message.content is not None:
                new_content.append(
                    AnthropicResponseContentBlockText(
                        type="text", text=choice.message.content
                    )
                )

        return new_content

    def _translate_openai_finish_reason_to_anthropic(
        self, openai_finish_reason: str
    ) -> AnthropicFinishReason:
        if openai_finish_reason == "stop":
            return "end_turn"
        elif openai_finish_reason == "length":
            return "max_tokens"
        elif openai_finish_reason == "tool_calls":
            return "tool_use"
        return "end_turn"

    def translate_openai_response_to_anthropic(
        self, response: ModelResponse
    ) -> AnthropicResponse:
        ## translate content block
        anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices)  # type: ignore
        ## extract finish reason
        anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic(
            openai_finish_reason=response.choices[0].finish_reason  # type: ignore
        )
        # extract usage
        usage: Usage = getattr(response, "usage")
        anthropic_usage = AnthropicResponseUsageBlock(
            input_tokens=usage.prompt_tokens or 0,
            output_tokens=usage.completion_tokens or 0,
        )
        translated_obj = AnthropicResponse(
            id=response.id,
            type="message",
            role="assistant",
            model=response.model or "unknown-model",
            stop_sequence=None,
            usage=anthropic_usage,
            content=anthropic_content,
            stop_reason=anthropic_finish_reason,
        )

        return translated_obj

    def _translate_streaming_openai_chunk_to_anthropic(
        self, choices: List[OpenAIStreamingChoice]
    ) -> Tuple[
        Literal["text_delta", "input_json_delta"],
        Union[ContentTextBlockDelta, ContentJsonBlockDelta],
    ]:
        text: str = ""
        partial_json: Optional[str] = None
        for choice in choices:
            if choice.delta.content is not None:
                text += choice.delta.content
            elif choice.delta.tool_calls is not None:
                partial_json = ""
                for tool in choice.delta.tool_calls:
                    if (
                        tool.function is not None
                        and tool.function.arguments is not None
                    ):
                        partial_json += tool.function.arguments

        if partial_json is not None:
            return "input_json_delta", ContentJsonBlockDelta(
                type="input_json_delta", partial_json=partial_json
            )
        else:
            return "text_delta", ContentTextBlockDelta(type="text_delta", text=text)

    def translate_streaming_openai_response_to_anthropic(
        self, response: ModelResponse
    ) -> Union[ContentBlockDelta, MessageBlockDelta]:
        ## base case - final chunk w/ finish reason
        if response.choices[0].finish_reason is not None:
            delta = MessageDelta(
                stop_reason=self._translate_openai_finish_reason_to_anthropic(
                    response.choices[0].finish_reason
                ),
            )
            if getattr(response, "usage", None) is not None:
                litellm_usage_chunk: Optional[Usage] = response.usage  # type: ignore
            elif (
                hasattr(response, "_hidden_params")
                and "usage" in response._hidden_params
            ):
                litellm_usage_chunk = response._hidden_params["usage"]
            else:
                litellm_usage_chunk = None
            if litellm_usage_chunk is not None:
                usage_delta = UsageDelta(
                    input_tokens=litellm_usage_chunk.prompt_tokens or 0,
                    output_tokens=litellm_usage_chunk.completion_tokens or 0,
                )
            else:
                usage_delta = UsageDelta(input_tokens=0, output_tokens=0)
            return MessageBlockDelta(
                type="message_delta", delta=delta, usage=usage_delta
            )
        (
            type_of_content,
            content_block_delta,
        ) = self._translate_streaming_openai_chunk_to_anthropic(
            choices=response.choices  # type: ignore
        )
        return ContentBlockDelta(
            type="content_block_delta",
            index=response.choices[0].index,
            delta=content_block_delta,
        )

litellm/llms/base_llm/anthropic_messages/transformation.py (new file, 35 lines)
@@ -0,0 +1,35 @@

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj

    LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
    LiteLLMLoggingObj = Any


class BaseAnthropicMessagesConfig(ABC):
    @abstractmethod
    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
    ) -> dict:
        pass

    @abstractmethod
    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        """
        OPTIONAL

        Get the complete url for the request

        Some providers need `model` in `api_base`
        """
        return api_base or ""

    @abstractmethod
    def get_supported_anthropic_messages_params(self, model: str) -> list:
        pass

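This abstract base is the extension point for wiring other providers into the `/v1/messages` route. A sketch of what a hypothetical provider config might look like (the provider name, URL, and parameter list here are illustrative, not part of this commit):

from typing import Optional

from litellm.llms.base_llm.anthropic_messages.transformation import (
    BaseAnthropicMessagesConfig,
)


class MyProviderMessagesConfig(BaseAnthropicMessagesConfig):
    def validate_environment(
        self, headers: dict, model: str, api_key: Optional[str] = None
    ) -> dict:
        # fill in any auth / content-type headers the provider expects
        headers.setdefault("authorization", f"Bearer {api_key}")
        headers.setdefault("content-type", "application/json")
        return headers

    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        # hypothetical endpoint shape
        return (api_base or "https://example-provider.invalid") + "/v1/messages"

    def get_supported_anthropic_messages_params(self, model: str) -> list:
        return ["messages", "model", "max_tokens", "temperature", "stream"]
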
@@ -1963,7 +1963,7 @@ class ProxyException(Exception):
         code: Optional[Union[int, str]] = None,
         headers: Optional[Dict[str, str]] = None,
     ):
-        self.message = message
+        self.message = str(message)
         self.type = type
         self.param = param

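A small illustration of why the constructor now coerces `message` to `str` (a sketch; the assumption here is that some callers pass non-string objects, e.g. dict-shaped provider error bodies):

from litellm.proxy._types import ProxyException

# A dict-like error body no longer breaks downstream string handling
exc = ProxyException(
    message={"error": "bad request"},  # non-string message
    type="invalid_request_error",
    param="messages",
    code=400,
)
assert isinstance(exc.message, str)
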
litellm/proxy/anthropic_endpoints/endpoints.py (new file, 252 lines)
@@ -0,0 +1,252 @@

"""
Unified /v1/messages endpoint - (Anthropic Spec)
"""

import asyncio
import json
import time
import traceback

from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
from fastapi.responses import StreamingResponse

import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import *
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
from litellm.proxy.utils import ProxyLogging

router = APIRouter()


async def async_data_generator_anthropic(
    response,
    user_api_key_dict: UserAPIKeyAuth,
    request_data: dict,
    proxy_logging_obj: ProxyLogging,
):
    verbose_proxy_logger.debug("inside generator")
    try:
        time.time()
        async for chunk in response:
            verbose_proxy_logger.debug(
                "async_data_generator: received streaming chunk - {}".format(chunk)
            )
            ### CALL HOOKS ### - modify outgoing data
            chunk = await proxy_logging_obj.async_post_call_streaming_hook(
                user_api_key_dict=user_api_key_dict, response=chunk
            )

            yield chunk
    except Exception as e:
        verbose_proxy_logger.exception(
            "litellm.proxy.proxy_server.async_data_generator(): Exception occured - {}".format(
                str(e)
            )
        )
        await proxy_logging_obj.post_call_failure_hook(
            user_api_key_dict=user_api_key_dict,
            original_exception=e,
            request_data=request_data,
        )
        verbose_proxy_logger.debug(
            f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`"
        )

        if isinstance(e, HTTPException):
            raise e
        else:
            error_traceback = traceback.format_exc()
            error_msg = f"{str(e)}\n\n{error_traceback}"

            proxy_exception = ProxyException(
                message=getattr(e, "message", error_msg),
                type=getattr(e, "type", "None"),
                param=getattr(e, "param", "None"),
                code=getattr(e, "status_code", 500),
            )
            error_returned = json.dumps({"error": proxy_exception.to_dict()})
            yield f"data: {error_returned}\n\n"


@router.post(
    "/v1/messages",
    tags=["[beta] Anthropic `/v1/messages`"],
    dependencies=[Depends(user_api_key_auth)],
    include_in_schema=False,
)
async def anthropic_response(  # noqa: PLR0915
    fastapi_response: Response,
    request: Request,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Use `{PROXY_BASE_URL}/anthropic/v1/messages` instead - [Docs](https://docs.litellm.ai/docs/anthropic_completion).

    This was a BETA endpoint that calls 100+ LLMs in the anthropic format.
    """
    from litellm.proxy.proxy_server import (
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
        user_api_base,
        user_max_tokens,
        user_model,
        user_request_timeout,
        user_temperature,
        version,
    )

    request_data = await _read_request_body(request=request)
    data: dict = {**request_data}
    try:
        data["model"] = (
            general_settings.get("completion_model", None)  # server default
            or user_model  # model name passed via cli args
            or data.get("model", None)  # default passed in http request
        )
        if user_model:
            data["model"] = user_model

        data = await add_litellm_data_to_request(
            data=data,  # type: ignore
            request=request,
            general_settings=general_settings,
            user_api_key_dict=user_api_key_dict,
            version=version,
            proxy_config=proxy_config,
        )

        # override with user settings, these are params passed via cli
        if user_temperature:
            data["temperature"] = user_temperature
        if user_request_timeout:
            data["request_timeout"] = user_request_timeout
        if user_max_tokens:
            data["max_tokens"] = user_max_tokens
        if user_api_base:
            data["api_base"] = user_api_base

        ### MODEL ALIAS MAPPING ###
        # check if model name in model alias map
        # get the actual model name
        if data["model"] in litellm.model_alias_map:
            data["model"] = litellm.model_alias_map[data["model"]]

        ### CALL HOOKS ### - modify incoming data before calling the model
        data = await proxy_logging_obj.pre_call_hook(  # type: ignore
            user_api_key_dict=user_api_key_dict, data=data, call_type="text_completion"
        )

        ### ROUTE THE REQUESTs ###
        router_model_names = llm_router.model_names if llm_router is not None else []

        # skip router if user passed their key
        if (
            llm_router is not None and data["model"] in router_model_names
        ):  # model in router model list
            llm_response = asyncio.create_task(llm_router.aanthropic_messages(**data))
        elif (
            llm_router is not None
            and llm_router.model_group_alias is not None
            and data["model"] in llm_router.model_group_alias
        ):  # model set in model_group_alias
            llm_response = asyncio.create_task(llm_router.aanthropic_messages(**data))
        elif (
            llm_router is not None and data["model"] in llm_router.deployment_names
        ):  # model in router deployments, calling a specific deployment on the router
            llm_response = asyncio.create_task(
                llm_router.aanthropic_messages(**data, specific_deployment=True)
            )
        elif (
            llm_router is not None and data["model"] in llm_router.get_model_ids()
        ):  # model in router model list
            llm_response = asyncio.create_task(llm_router.aanthropic_messages(**data))
        elif (
            llm_router is not None
            and data["model"] not in router_model_names
            and (
                llm_router.default_deployment is not None
                or len(llm_router.pattern_router.patterns) > 0
            )
        ):  # model in router deployments, calling a specific deployment on the router
            llm_response = asyncio.create_task(llm_router.aanthropic_messages(**data))
        elif user_model is not None:  # `litellm --model <your-model-name>`
            llm_response = asyncio.create_task(litellm.anthropic_messages(**data))
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail={
                    "error": "completion: Invalid model name passed in model="
                    + data.get("model", "")
                },
            )

        # Await the llm_response task
        response = await llm_response

        hidden_params = getattr(response, "_hidden_params", {}) or {}
        model_id = hidden_params.get("model_id", None) or ""
        cache_key = hidden_params.get("cache_key", None) or ""
        api_base = hidden_params.get("api_base", None) or ""
        response_cost = hidden_params.get("response_cost", None) or ""

        ### ALERTING ###
        asyncio.create_task(
            proxy_logging_obj.update_request_status(
                litellm_call_id=data.get("litellm_call_id", ""), status="success"
            )
        )

        verbose_proxy_logger.debug("final response: %s", response)

        fastapi_response.headers.update(
            get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
                api_base=api_base,
                version=version,
                response_cost=response_cost,
                request_data=data,
                hidden_params=hidden_params,
            )
        )

        if (
            "stream" in data and data["stream"] is True
        ):  # use generate_responses to stream responses
            selected_data_generator = async_data_generator_anthropic(
                response=response,
                user_api_key_dict=user_api_key_dict,
                request_data=data,
                proxy_logging_obj=proxy_logging_obj,
            )

            return StreamingResponse(
                selected_data_generator,  # type: ignore
                media_type="text/event-stream",
            )

        verbose_proxy_logger.info("\nResponse from Litellm:\n{}".format(response))
        return response
    except Exception as e:
        await proxy_logging_obj.post_call_failure_hook(
            user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
        )
        verbose_proxy_logger.exception(
            "litellm.proxy.proxy_server.anthropic_response(): Exception occured - {}".format(
                str(e)
            )
        )
        error_msg = f"{str(e)}"
        raise ProxyException(
            message=getattr(e, "message", error_msg),
            type=getattr(e, "type", "None"),
            param=getattr(e, "param", "None"),
            code=getattr(e, "status_code", 500),
        )

@@ -4,6 +4,26 @@ model_list:
       model: openai/fake
       api_key: fake-key
       api_base: https://exampleopenaiendpoint-production.up.railway.app/
+  - model_name: claude-3-5-sonnet-20241022
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20241022
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: claude-special-alias
+    litellm_params:
+      model: anthropic/claude-3-haiku-20240307
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: claude-3-5-sonnet-20241022
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20241022
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: claude-3-7-sonnet-20250219
+    litellm_params:
+      model: anthropic/claude-3-7-sonnet-20250219
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: anthropic/*
+    litellm_params:
+      model: anthropic/*
+      api_key: os.environ/ANTHROPIC_API_KEY
 general_settings:
   master_key: sk-1234
   custom_auth: custom_auth_basic.user_api_key_auth

@@ -4,7 +4,22 @@ model_list:
       model: openai/my-fake-model
       api_key: my-fake-key
       api_base: https://exampleopenaiendpoint-production.up.railway.app/
+  - model_name: claude-special-alias
+    litellm_params:
+      model: anthropic/claude-3-haiku-20240307
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: claude-3-5-sonnet-20241022
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20241022
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: claude-3-7-sonnet-20250219
+    litellm_params:
+      model: anthropic/claude-3-7-sonnet-20250219
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: anthropic/*
+    litellm_params:
+      model: anthropic/*
+      api_key: os.environ/ANTHROPIC_API_KEY
+
 general_settings:
   store_model_in_db: true

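With a config like the ones above loaded into the proxy, the unified endpoint can be exercised roughly like this (a sketch; it assumes the proxy is running locally on port 4000 and that the `sk-1234` master key from the first sample config is used as the bearer token):

import httpx

response = httpx.post(
    "http://localhost:4000/v1/messages",
    headers={
        "Authorization": "Bearer sk-1234",  # litellm proxy virtual key / master key
        "content-type": "application/json",
    },
    json={
        "model": "claude-3-5-sonnet-20241022",  # a model_name from the config
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello from the unified /v1/messages route"}],
    },
)
print(response.json())
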
@@ -120,6 +120,7 @@ from litellm.proxy._types import *
 from litellm.proxy.analytics_endpoints.analytics_endpoints import (
     router as analytics_router,
 )
+from litellm.proxy.anthropic_endpoints.endpoints import router as anthropic_router
 from litellm.proxy.auth.auth_checks import log_db_metrics
 from litellm.proxy.auth.auth_utils import check_response_size_is_safe
 from litellm.proxy.auth.handle_jwt import JWTHandler

@@ -3065,58 +3066,6 @@ async def async_data_generator(
         yield f"data: {error_returned}\n\n"


-async def async_data_generator_anthropic(
-    response, user_api_key_dict: UserAPIKeyAuth, request_data: dict
-):
-    verbose_proxy_logger.debug("inside generator")
-    try:
-        time.time()
-        async for chunk in response:
-            verbose_proxy_logger.debug(
-                "async_data_generator: received streaming chunk - {}".format(chunk)
-            )
-            ### CALL HOOKS ### - modify outgoing data
-            chunk = await proxy_logging_obj.async_post_call_streaming_hook(
-                user_api_key_dict=user_api_key_dict, response=chunk
-            )
-
-            event_type = chunk.get("type")
-
-            try:
-                yield f"event: {event_type}\ndata:{json.dumps(chunk)}\n\n"
-            except Exception as e:
-                yield f"event: {event_type}\ndata:{str(e)}\n\n"
-    except Exception as e:
-        verbose_proxy_logger.exception(
-            "litellm.proxy.proxy_server.async_data_generator(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-        await proxy_logging_obj.post_call_failure_hook(
-            user_api_key_dict=user_api_key_dict,
-            original_exception=e,
-            request_data=request_data,
-        )
-        verbose_proxy_logger.debug(
-            f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`"
-        )
-
-        if isinstance(e, HTTPException):
-            raise e
-        else:
-            error_traceback = traceback.format_exc()
-            error_msg = f"{str(e)}\n\n{error_traceback}"
-
-            proxy_exception = ProxyException(
-                message=getattr(e, "message", error_msg),
-                type=getattr(e, "type", "None"),
-                param=getattr(e, "param", "None"),
-                code=getattr(e, "status_code", 500),
-            )
-            error_returned = json.dumps({"error": proxy_exception.to_dict()})
-            yield f"data: {error_returned}\n\n"
-
-
 def select_data_generator(
     response, user_api_key_dict: UserAPIKeyAuth, request_data: dict
 ):

@@ -5524,224 +5473,6 @@ async def moderations(
     )


-#### ANTHROPIC ENDPOINTS ####
-
-
-@router.post(
-    "/v1/messages",
-    tags=["[beta] Anthropic `/v1/messages`"],
-    dependencies=[Depends(user_api_key_auth)],
-    response_model=AnthropicResponse,
-    include_in_schema=False,
-)
-async def anthropic_response(  # noqa: PLR0915
-    anthropic_data: AnthropicMessagesRequest,
-    fastapi_response: Response,
-    request: Request,
-    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
-):
-    """
-    🚨 DEPRECATED ENDPOINT🚨
-
-    Use `{PROXY_BASE_URL}/anthropic/v1/messages` instead - [Docs](https://docs.litellm.ai/docs/anthropic_completion).
-
-    This was a BETA endpoint that calls 100+ LLMs in the anthropic format.
-    """
-    from litellm import adapter_completion
-    from litellm.adapters.anthropic_adapter import anthropic_adapter
-
-    litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
-
-    global user_temperature, user_request_timeout, user_max_tokens, user_api_base
-    request_data = await _read_request_body(request=request)
-    data: dict = {**request_data, "adapter_id": "anthropic"}
-    try:
-        data["model"] = (
-            general_settings.get("completion_model", None)  # server default
-            or user_model  # model name passed via cli args
-            or data.get("model", None)  # default passed in http request
-        )
-        if user_model:
-            data["model"] = user_model
-
-        data = await add_litellm_data_to_request(
-            data=data,  # type: ignore
-            request=request,
-            general_settings=general_settings,
-            user_api_key_dict=user_api_key_dict,
-            version=version,
-            proxy_config=proxy_config,
-        )
-
-        # override with user settings, these are params passed via cli
-        if user_temperature:
-            data["temperature"] = user_temperature
-        if user_request_timeout:
-            data["request_timeout"] = user_request_timeout
-        if user_max_tokens:
-            data["max_tokens"] = user_max_tokens
-        if user_api_base:
-            data["api_base"] = user_api_base
-
-        ### MODEL ALIAS MAPPING ###
-        # check if model name in model alias map
-        # get the actual model name
-        if data["model"] in litellm.model_alias_map:
-            data["model"] = litellm.model_alias_map[data["model"]]
-
-        ### CALL HOOKS ### - modify incoming data before calling the model
-        data = await proxy_logging_obj.pre_call_hook(  # type: ignore
-            user_api_key_dict=user_api_key_dict, data=data, call_type="text_completion"
-        )
-
-        ### ROUTE THE REQUESTs ###
-        router_model_names = llm_router.model_names if llm_router is not None else []
-        # skip router if user passed their key
-        if "api_key" in data:
-            llm_response = asyncio.create_task(litellm.aadapter_completion(**data))
-        elif (
-            llm_router is not None and data["model"] in router_model_names
-        ):  # model in router model list
-            llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
-        elif (
-            llm_router is not None
-            and llm_router.model_group_alias is not None
-            and data["model"] in llm_router.model_group_alias
-        ):  # model set in model_group_alias
-            llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
-        elif (
-            llm_router is not None and data["model"] in llm_router.deployment_names
-        ):  # model in router deployments, calling a specific deployment on the router
-            llm_response = asyncio.create_task(
-                llm_router.aadapter_completion(**data, specific_deployment=True)
-            )
-        elif (
-            llm_router is not None and data["model"] in llm_router.get_model_ids()
-        ):  # model in router model list
-            llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
-        elif (
-            llm_router is not None
-            and data["model"] not in router_model_names
-            and (
-                llm_router.default_deployment is not None
-                or len(llm_router.pattern_router.patterns) > 0
-            )
-        ):  # model in router deployments, calling a specific deployment on the router
-            llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
-        elif user_model is not None:  # `litellm --model <your-model-name>`
-            llm_response = asyncio.create_task(litellm.aadapter_completion(**data))
-        else:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail={
-                    "error": "completion: Invalid model name passed in model="
-                    + data.get("model", "")
-                },
-            )
-
-        # Await the llm_response task
-        response = await llm_response
-
-        hidden_params = getattr(response, "_hidden_params", {}) or {}
-        model_id = hidden_params.get("model_id", None) or ""
-        cache_key = hidden_params.get("cache_key", None) or ""
-        api_base = hidden_params.get("api_base", None) or ""
-        response_cost = hidden_params.get("response_cost", None) or ""
-
-        ### ALERTING ###
-        asyncio.create_task(
-            proxy_logging_obj.update_request_status(
-                litellm_call_id=data.get("litellm_call_id", ""), status="success"
-            )
-        )
-
-        verbose_proxy_logger.debug("final response: %s", response)
-
-        fastapi_response.headers.update(
-            get_custom_headers(
-                user_api_key_dict=user_api_key_dict,
-                model_id=model_id,
-                cache_key=cache_key,
-                api_base=api_base,
-                version=version,
-                response_cost=response_cost,
-                request_data=data,
-                hidden_params=hidden_params,
-            )
-        )
-
-        if (
-            "stream" in data and data["stream"] is True
-        ):  # use generate_responses to stream responses
-            selected_data_generator = async_data_generator_anthropic(
-                response=response,
-                user_api_key_dict=user_api_key_dict,
-                request_data=data,
-            )
-            return StreamingResponse(
-                selected_data_generator,
-                media_type="text/event-stream",
-            )
-
-        verbose_proxy_logger.info("\nResponse from Litellm:\n{}".format(response))
-        return response
-    except RejectedRequestError as e:
-        _data = e.request_data
-        await proxy_logging_obj.post_call_failure_hook(
-            user_api_key_dict=user_api_key_dict,
-            original_exception=e,
-            request_data=_data,
-        )
-        if _data.get("stream", None) is not None and _data["stream"] is True:
-            _chat_response = litellm.ModelResponse()
-            _usage = litellm.Usage(
-                prompt_tokens=0,
-                completion_tokens=0,
-                total_tokens=0,
-            )
-            _chat_response.usage = _usage  # type: ignore
-            _chat_response.choices[0].message.content = e.message  # type: ignore
-            _iterator = litellm.utils.ModelResponseIterator(
-                model_response=_chat_response, convert_to_delta=True
-            )
-            _streaming_response = litellm.TextCompletionStreamWrapper(
-                completion_stream=_iterator,
-                model=_data.get("model", ""),
-            )
-
-            selected_data_generator = select_data_generator(
-                response=_streaming_response,
-                user_api_key_dict=user_api_key_dict,
-                request_data=data,
-            )
-
-            return StreamingResponse(
-                selected_data_generator,
-                media_type="text/event-stream",
-                headers={},
-            )
-        else:
-            _response = litellm.TextCompletionResponse()
-            _response.choices[0].text = e.message
-            return _response
-    except Exception as e:
-        await proxy_logging_obj.post_call_failure_hook(
-            user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
-        )
-        verbose_proxy_logger.exception(
-            "litellm.proxy.proxy_server.anthropic_response(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-        error_msg = f"{str(e)}"
-        raise ProxyException(
-            message=getattr(e, "message", error_msg),
-            type=getattr(e, "type", "None"),
-            param=getattr(e, "param", "None"),
-            code=getattr(e, "status_code", 500),
-        )
-
-
 #### DEV UTILS ####

 # @router.get(
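Note: the deprecation notice in the deleted endpoint points callers at the pass-through route instead. A minimal sketch of that path, mirroring the TestAnthropicPassthroughBasic setup added later in this commit (local test host and virtual key assumed):

import anthropic

# Same SDK call shape, routed through the /anthropic pass-through prefix on the proxy.
client = anthropic.Anthropic(base_url="http://0.0.0.0:4000/anthropic", api_key="sk-1234")

response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=100,
    messages=[{"role": "user", "content": "Say 'hello test' and nothing else"}],
)
print(response)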
@@ -8840,6 +8571,7 @@ app.include_router(rerank_router)
 app.include_router(fine_tuning_router)
 app.include_router(vertex_router)
 app.include_router(llm_passthrough_router)
+app.include_router(anthropic_router)
 app.include_router(langfuse_router)
 app.include_router(pass_through_router)
 app.include_router(health_router)
@@ -10,6 +10,7 @@ from pydantic import BaseModel

 import litellm
 from litellm._logging import verbose_proxy_logger
+from litellm.litellm_core_utils.core_helpers import get_litellm_metadata_from_kwargs
 from litellm.proxy._types import SpendLogsMetadata, SpendLogsPayload
 from litellm.proxy.utils import PrismaClient, hash_token
 from litellm.types.utils import StandardLoggingPayload
@@ -119,9 +120,7 @@ def get_logging_payload(  # noqa: PLR0915
     response_obj = {}
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
     litellm_params = kwargs.get("litellm_params", {})
-    metadata = (
-        litellm_params.get("metadata", {}) or {}
-    )  # if litellm_params['metadata'] == None
+    metadata = get_litellm_metadata_from_kwargs(kwargs)
     metadata = _add_proxy_server_request_to_metadata(
         metadata=metadata, litellm_params=litellm_params
     )
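Note: get_litellm_metadata_from_kwargs centralizes the inline litellm_params.get("metadata", ...) lookup so spend logs also pick up metadata on the new /v1/messages calls. A rough sketch of what the helper is assumed to do (the real implementation lives in litellm/litellm_core_utils/core_helpers.py and may differ):

def get_litellm_metadata_from_kwargs(kwargs: dict) -> dict:
    # Assumed behavior: prefer litellm_metadata (used by /v1/messages-style requests),
    # otherwise fall back to the classic metadata field on litellm_params.
    litellm_params = kwargs.get("litellm_params", {}) or {}
    litellm_metadata = litellm_params.get("litellm_metadata")
    if litellm_metadata:
        return litellm_metadata
    return litellm_params.get("metadata", {}) or {}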
@@ -580,6 +580,9 @@ class Router:
         self.amoderation = self.factory_function(
             litellm.amoderation, call_type="moderation"
         )
+        self.aanthropic_messages = self.factory_function(
+            litellm.anthropic_messages, call_type="anthropic_messages"
+        )

     def discard(self):
         """
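Note: with aanthropic_messages registered through the factory, the router exposes the Anthropic /v1/messages call shape directly; the new router tests later in this commit call it like this (sketch, ANTHROPIC_API_KEY assumed to be set):

import asyncio
import os

from litellm.router import Router

router = Router(
    model_list=[
        {
            "model_name": "claude-special-alias",
            "litellm_params": {
                "model": "claude-3-haiku-20240307",
                "api_key": os.getenv("ANTHROPIC_API_KEY"),
            },
        }
    ]
)


async def main():
    # Goes through the router, so retries/fallbacks/usage tracking apply.
    response = await router.aanthropic_messages(
        model="claude-special-alias",
        messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
        max_tokens=100,
    )
    print(response)


asyncio.run(main())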
@@ -2349,6 +2352,89 @@ class Router:
                 self.fail_calls[model] += 1
             raise e

+    async def _ageneric_api_call_with_fallbacks(
+        self, model: str, original_function: Callable, **kwargs
+    ):
+        """
+        Make a generic LLM API call through the router, this allows you to use retries/fallbacks with litellm router
+
+        Args:
+            model: The model to use
+            handler_function: The handler function to call (e.g., litellm.anthropic_messages)
+            **kwargs: Additional arguments to pass to the handler function
+
+        Returns:
+            The response from the handler function
+        """
+        handler_name = original_function.__name__
+        try:
+            verbose_router_logger.debug(
+                f"Inside _ageneric_api_call() - handler: {handler_name}, model: {model}; kwargs: {kwargs}"
+            )
+            parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
+            deployment = await self.async_get_available_deployment(
+                model=model,
+                request_kwargs=kwargs,
+                messages=kwargs.get("messages", None),
+                specific_deployment=kwargs.pop("specific_deployment", None),
+            )
+            self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)
+
+            data = deployment["litellm_params"].copy()
+            model_name = data["model"]
+
+            model_client = self._get_async_openai_model_client(
+                deployment=deployment,
+                kwargs=kwargs,
+            )
+            self.total_calls[model_name] += 1
+
+            response = original_function(
+                **{
+                    **data,
+                    "caching": self.cache_responses,
+                    "client": model_client,
+                    **kwargs,
+                }
+            )
+
+            rpm_semaphore = self._get_client(
+                deployment=deployment,
+                kwargs=kwargs,
+                client_type="max_parallel_requests",
+            )
+
+            if rpm_semaphore is not None and isinstance(
+                rpm_semaphore, asyncio.Semaphore
+            ):
+                async with rpm_semaphore:
+                    """
+                    - Check rpm limits before making the call
+                    - If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
+                    """
+                    await self.async_routing_strategy_pre_call_checks(
+                        deployment=deployment, parent_otel_span=parent_otel_span
+                    )
+                    response = await response  # type: ignore
+            else:
+                await self.async_routing_strategy_pre_call_checks(
+                    deployment=deployment, parent_otel_span=parent_otel_span
+                )
+                response = await response  # type: ignore
+
+            self.success_calls[model_name] += 1
+            verbose_router_logger.info(
+                f"{handler_name}(model={model_name})\033[32m 200 OK\033[0m"
+            )
+            return response
+        except Exception as e:
+            verbose_router_logger.info(
+                f"{handler_name}(model={model})\033[31m Exception {str(e)}\033[0m"
+            )
+            if model is not None:
+                self.fail_calls[model] += 1
+            raise e
+
     def embedding(
         self,
         model: str,
@@ -2869,10 +2955,14 @@ class Router:
     def factory_function(
         self,
         original_function: Callable,
-        call_type: Literal["assistants", "moderation"] = "assistants",
+        call_type: Literal[
+            "assistants", "moderation", "anthropic_messages"
+        ] = "assistants",
     ):
         async def new_function(
-            custom_llm_provider: Optional[Literal["openai", "azure"]] = None,
+            custom_llm_provider: Optional[
+                Literal["openai", "azure", "anthropic"]
+            ] = None,
             client: Optional["AsyncOpenAI"] = None,
             **kwargs,
         ):
@@ -2889,13 +2979,18 @@ class Router:
                     original_function=original_function,
                     **kwargs,
                 )
+            elif call_type == "anthropic_messages":
+                return await self._ageneric_api_call_with_fallbacks(  # type: ignore
+                    original_function=original_function,
+                    **kwargs,
+                )

         return new_function

     async def _pass_through_assistants_endpoint_factory(
         self,
         original_function: Callable,
-        custom_llm_provider: Optional[Literal["openai", "azure"]] = None,
+        custom_llm_provider: Optional[Literal["openai", "azure", "anthropic"]] = None,
         client: Optional[AsyncOpenAI] = None,
         **kwargs,
     ):
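Note: for call_type="anthropic_messages" the factory-produced function hands the wrapped litellm.anthropic_messages handler to _ageneric_api_call_with_fallbacks. The handler can also be called directly, as the new unit tests in this commit do; a minimal sketch (ANTHROPIC_API_KEY assumed to be set):

import asyncio
import os

from litellm.llms.anthropic.experimental_pass_through.messages.handler import (
    anthropic_messages,
)


async def main():
    # Direct call to the /v1/messages handler, bypassing the router.
    response = await anthropic_messages(
        messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
        api_key=os.getenv("ANTHROPIC_API_KEY"),
        model="claude-3-haiku-20240307",
        max_tokens=100,
    )
    print(response)


asyncio.run(main())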
@@ -186,6 +186,7 @@ class CallTypes(Enum):
     aretrieve_batch = "aretrieve_batch"
     retrieve_batch = "retrieve_batch"
     pass_through = "pass_through_endpoint"
+    anthropic_messages = "anthropic_messages"


 CallTypesLiteral = Literal[
@@ -209,6 +210,7 @@ CallTypesLiteral = Literal[
     "create_batch",
     "acreate_batch",
     "pass_through_endpoint",
+    "anthropic_messages",
 ]

@@ -191,6 +191,9 @@ from typing import (
 from openai import OpenAIError as OriginalError

 from litellm.litellm_core_utils.thread_pool_executor import executor
+from litellm.llms.base_llm.anthropic_messages.transformation import (
+    BaseAnthropicMessagesConfig,
+)
 from litellm.llms.base_llm.audio_transcription.transformation import (
     BaseAudioTranscriptionConfig,
 )
@@ -6245,6 +6248,15 @@ class ProviderConfigManager:
             return litellm.JinaAIRerankConfig()
         return litellm.CohereRerankConfig()

+    @staticmethod
+    def get_provider_anthropic_messages_config(
+        model: str,
+        provider: LlmProviders,
+    ) -> Optional[BaseAnthropicMessagesConfig]:
+        if litellm.LlmProviders.ANTHROPIC == provider:
+            return litellm.AnthropicMessagesConfig()
+        return None
+
     @staticmethod
     def get_provider_audio_transcription_config(
         model: str,
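Note: the new lookup mirrors the other get_provider_*_config helpers on ProviderConfigManager; a short usage sketch (the litellm.utils import path is assumed from the hunk above):

import litellm
from litellm.utils import ProviderConfigManager

config = ProviderConfigManager.get_provider_anthropic_messages_config(
    model="claude-3-haiku-20240307",
    provider=litellm.LlmProviders.ANTHROPIC,
)
# Anthropic is the only provider wired up here, so any other provider returns None.
print(config)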
@@ -329,57 +329,3 @@ async def test_aaapass_through_endpoint_pass_through_keys_langfuse(
     setattr(
         litellm.proxy.proxy_server, "proxy_logging_obj", original_proxy_logging_obj
     )
-
-
-@pytest.mark.asyncio
-async def test_pass_through_endpoint_anthropic(client):
-    import litellm
-    from litellm import Router
-    from litellm.adapters.anthropic_adapter import anthropic_adapter
-
-    router = Router(
-        model_list=[
-            {
-                "model_name": "gpt-3.5-turbo",
-                "litellm_params": {
-                    "model": "gpt-3.5-turbo",
-                    "api_key": os.getenv("OPENAI_API_KEY"),
-                    "mock_response": "Hey, how's it going?",
-                },
-            }
-        ]
-    )
-
-    setattr(litellm.proxy.proxy_server, "llm_router", router)
-
-    # Define a pass-through endpoint
-    pass_through_endpoints = [
-        {
-            "path": "/v1/test-messages",
-            "target": anthropic_adapter,
-            "headers": {"litellm_user_api_key": "my-test-header"},
-        }
-    ]
-
-    # Initialize the pass-through endpoint
-    await initialize_pass_through_endpoints(pass_through_endpoints)
-    general_settings: Optional[dict] = (
-        getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
-    )
-    general_settings.update({"pass_through_endpoints": pass_through_endpoints})
-    setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
-
-    _json_data = {
-        "model": "gpt-3.5-turbo",
-        "messages": [{"role": "user", "content": "Who are you?"}],
-    }
-
-    # Make a request to the pass-through endpoint
-    response = client.post(
-        "/v1/test-messages", json=_json_data, headers={"my-test-header": "my-test-key"}
-    )
-
-    print("JSON response: ", _json_data)
-
-    # Assert the response
-    assert response.status_code == 200
tests/pass_through_tests/base_anthropic_messages_test.py (new file, +145 lines)
@@ -0,0 +1,145 @@
+from abc import ABC, abstractmethod
+
+import anthropic
+import pytest
+
+
+class BaseAnthropicMessagesTest(ABC):
+    """
+    Abstract base test class that enforces a common test across all test classes.
+    """
+
+    @abstractmethod
+    def get_client(self):
+        return anthropic.Anthropic()
+
+    def test_anthropic_basic_completion(self):
+        print("making basic completion request to anthropic passthrough")
+        client = self.get_client()
+        response = client.messages.create(
+            model="claude-3-5-sonnet-20241022",
+            max_tokens=1024,
+            messages=[{"role": "user", "content": "Say 'hello test' and nothing else"}],
+            extra_body={
+                "litellm_metadata": {
+                    "tags": ["test-tag-1", "test-tag-2"],
+                }
+            },
+        )
+        print(response)
+
+    def test_anthropic_streaming(self):
+        print("making streaming request to anthropic passthrough")
+        collected_output = []
+        client = self.get_client()
+        with client.messages.stream(
+            max_tokens=10,
+            messages=[
+                {"role": "user", "content": "Say 'hello stream test' and nothing else"}
+            ],
+            model="claude-3-5-sonnet-20241022",
+            extra_body={
+                "litellm_metadata": {
+                    "tags": ["test-tag-stream-1", "test-tag-stream-2"],
+                }
+            },
+        ) as stream:
+            for text in stream.text_stream:
+                collected_output.append(text)
+
+        full_response = "".join(collected_output)
+        print(full_response)
+
+    def test_anthropic_messages_with_thinking(self):
+        print("making request to anthropic passthrough with thinking")
+        client = self.get_client()
+        response = client.messages.create(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=20000,
+            thinking={"type": "enabled", "budget_tokens": 16000},
+            messages=[
+                {"role": "user", "content": "Just pinging with thinking enabled"}
+            ],
+        )
+
+        print(response)
+
+        # Verify the first content block is a thinking block
+        response_thinking = response.content[0].thinking
+        assert response_thinking is not None
+        assert len(response_thinking) > 0
+
+    def test_anthropic_streaming_with_thinking(self):
+        print("making streaming request to anthropic passthrough with thinking enabled")
+        collected_thinking = []
+        collected_response = []
+        client = self.get_client()
+        with client.messages.stream(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=20000,
+            thinking={"type": "enabled", "budget_tokens": 16000},
+            messages=[
+                {"role": "user", "content": "Just pinging with thinking enabled"}
+            ],
+        ) as stream:
+            for event in stream:
+                if event.type == "content_block_delta":
+                    if event.delta.type == "thinking_delta":
+                        collected_thinking.append(event.delta.thinking)
+                    elif event.delta.type == "text_delta":
+                        collected_response.append(event.delta.text)
+
+        full_thinking = "".join(collected_thinking)
+        full_response = "".join(collected_response)
+
+        print(
+            f"Thinking Response: {full_thinking[:100]}..."
+        )  # Print first 100 chars of thinking
+        print(f"Response: {full_response}")
+
+        # Verify we received thinking content
+        assert len(collected_thinking) > 0
+        assert len(full_thinking) > 0
+
+        # Verify we also received a response
+        assert len(collected_response) > 0
+        assert len(full_response) > 0
+
+    def test_bad_request_error_handling_streaming(self):
+        print("making request to anthropic passthrough with bad request")
+        try:
+            client = self.get_client()
+            response = client.messages.create(
+                model="claude-3-5-sonnet-20241022",
+                max_tokens=10,
+                stream=True,
+                messages=["hi"],
+            )
+            print(response)
+            assert pytest.fail("Expected BadRequestError")
+        except anthropic.BadRequestError as e:
+            print("Got BadRequestError from anthropic, e=", e)
+            print(e.__cause__)
+            print(e.status_code)
+            print(e.response)
+        except Exception as e:
+            pytest.fail(f"Got unexpected exception: {e}")
+
+    def test_bad_request_error_handling_non_streaming(self):
+        print("making request to anthropic passthrough with bad request")
+        try:
+            client = self.get_client()
+            response = client.messages.create(
+                model="claude-3-5-sonnet-20241022",
+                max_tokens=10,
+                messages=["hi"],
+            )
+            print(response)
+            assert pytest.fail("Expected BadRequestError")
+        except anthropic.BadRequestError as e:
+            print("Got BadRequestError from anthropic, e=", e)
+            print(e.__cause__)
+            print(e.status_code)
+            print(e.response)
+        except Exception as e:
+            pytest.fail(f"Got unexpected exception: {e}")
@@ -8,48 +8,6 @@ import aiohttp
 import asyncio
 import json

-client = anthropic.Anthropic(
-    base_url="http://0.0.0.0:4000/anthropic", api_key="sk-1234"
-)
-
-
-def test_anthropic_basic_completion():
-    print("making basic completion request to anthropic passthrough")
-    response = client.messages.create(
-        model="claude-3-5-sonnet-20241022",
-        max_tokens=1024,
-        messages=[{"role": "user", "content": "Say 'hello test' and nothing else"}],
-        extra_body={
-            "litellm_metadata": {
-                "tags": ["test-tag-1", "test-tag-2"],
-            }
-        },
-    )
-    print(response)
-
-
-def test_anthropic_streaming():
-    print("making streaming request to anthropic passthrough")
-    collected_output = []
-
-    with client.messages.stream(
-        max_tokens=10,
-        messages=[
-            {"role": "user", "content": "Say 'hello stream test' and nothing else"}
-        ],
-        model="claude-3-5-sonnet-20241022",
-        extra_body={
-            "litellm_metadata": {
-                "tags": ["test-tag-stream-1", "test-tag-stream-2"],
-            }
-        },
-    ) as stream:
-        for text in stream.text_stream:
-            collected_output.append(text)
-
-    full_response = "".join(collected_output)
-    print(full_response)
-
-
 @pytest.mark.asyncio
 async def test_anthropic_basic_completion_with_headers():
tests/pass_through_tests/test_anthropic_passthrough_basic.py (new file, +28 lines)
@@ -0,0 +1,28 @@
+from base_anthropic_messages_test import BaseAnthropicMessagesTest
+import anthropic
+
+
+class TestAnthropicPassthroughBasic(BaseAnthropicMessagesTest):
+
+    def get_client(self):
+        return anthropic.Anthropic(
+            base_url="http://0.0.0.0:4000/anthropic",
+            api_key="sk-1234",
+        )
+
+
+class TestAnthropicMessagesEndpoint(BaseAnthropicMessagesTest):
+    def get_client(self):
+        return anthropic.Anthropic(
+            base_url="http://0.0.0.0:4000",
+            api_key="sk-1234",
+        )
+
+    def test_anthropic_messages_to_wildcard_model(self):
+        client = self.get_client()
+        response = client.messages.create(
+            model="anthropic/claude-3-opus-20240229",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            max_tokens=100,
+        )
+        print(response)
@@ -0,0 +1,487 @@
+import json
+import os
+import sys
+from datetime import datetime
+from typing import AsyncIterator, Dict, Any
+import asyncio
+import unittest.mock
+from unittest.mock import AsyncMock, MagicMock
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+import pytest
+from dotenv import load_dotenv
+from litellm.llms.anthropic.experimental_pass_through.messages.handler import (
+    anthropic_messages,
+)
+from typing import Optional
+from litellm.types.utils import StandardLoggingPayload
+from litellm.integrations.custom_logger import CustomLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+from litellm.router import Router
+import importlib
+
+# Load environment variables
+load_dotenv()
+
+
+@pytest.fixture(scope="session")
+def event_loop():
+    """Create an instance of the default event loop for each test session."""
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    yield loop
+    loop.close()
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_and_teardown(event_loop):  # Add event_loop as a dependency
+    curr_dir = os.getcwd()
+    sys.path.insert(0, os.path.abspath("../.."))
+
+    import litellm
+    from litellm import Router
+
+    importlib.reload(litellm)
+
+    # Set the event loop from the fixture
+    asyncio.set_event_loop(event_loop)
+
+    print(litellm)
+    yield
+
+    # Clean up any pending tasks
+    pending = asyncio.all_tasks(event_loop)
+    for task in pending:
+        task.cancel()
+
+    # Run the event loop until all tasks are cancelled
+    if pending:
+        event_loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+
+
+def _validate_anthropic_response(response: Dict[str, Any]):
+    assert "id" in response
+    assert "content" in response
+    assert "model" in response
+    assert response["role"] == "assistant"
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_non_streaming():
+    """
+    Test the anthropic_messages with non-streaming request
+    """
+    # Get API key from environment
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    if not api_key:
+        pytest.skip("ANTHROPIC_API_KEY not found in environment")
+
+    # Set up test parameters
+    messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+
+    # Call the handler
+    response = await anthropic_messages(
+        messages=messages,
+        api_key=api_key,
+        model="claude-3-haiku-20240307",
+        max_tokens=100,
+    )
+
+    # Verify response
+    assert "id" in response
+    assert "content" in response
+    assert "model" in response
+    assert response["role"] == "assistant"
+
+    print(f"Non-streaming response: {json.dumps(response, indent=2)}")
+    return response
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_streaming():
+    """
+    Test the anthropic_messages with streaming request
+    """
+    # Get API key from environment
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    if not api_key:
+        pytest.skip("ANTHROPIC_API_KEY not found in environment")
+
+    # Set up test parameters
+    messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+
+    # Call the handler
+    async_httpx_client = AsyncHTTPHandler()
+    response = await anthropic_messages(
+        messages=messages,
+        api_key=api_key,
+        model="claude-3-haiku-20240307",
+        max_tokens=100,
+        stream=True,
+        client=async_httpx_client,
+    )
+
+    if isinstance(response, AsyncIterator):
+        async for chunk in response:
+            print("chunk=", chunk)
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_streaming_with_bad_request():
+    """
+    Test the anthropic_messages with streaming request
+    """
+    try:
+        response = await anthropic_messages(
+            messages=["hi"],
+            api_key=os.getenv("ANTHROPIC_API_KEY"),
+            model="claude-3-haiku-20240307",
+            max_tokens=100,
+            stream=True,
+        )
+        print(response)
+        async for chunk in response:
+            print("chunk=", chunk)
+    except Exception as e:
+        print("got exception", e)
+        print("vars", vars(e))
+        assert e.status_code == 400
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_router_streaming_with_bad_request():
+    """
+    Test the anthropic_messages with streaming request
+    """
+    try:
+        router = Router(
+            model_list=[
+                {
+                    "model_name": "claude-special-alias",
+                    "litellm_params": {
+                        "model": "claude-3-haiku-20240307",
+                        "api_key": os.getenv("ANTHROPIC_API_KEY"),
+                    },
+                }
+            ]
+        )
+
+        response = await router.aanthropic_messages(
+            messages=["hi"],
+            model="claude-special-alias",
+            max_tokens=100,
+            stream=True,
+        )
+        print(response)
+        async for chunk in response:
+            print("chunk=", chunk)
+    except Exception as e:
+        print("got exception", e)
+        print("vars", vars(e))
+        assert e.status_code == 400
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_litellm_router_non_streaming():
+    """
+    Test the anthropic_messages with non-streaming request
+    """
+    litellm._turn_on_debug()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "claude-special-alias",
+                "litellm_params": {
+                    "model": "claude-3-haiku-20240307",
+                    "api_key": os.getenv("ANTHROPIC_API_KEY"),
+                },
+            }
+        ]
+    )
+
+    # Set up test parameters
+    messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+
+    # Call the handler
+    response = await router.aanthropic_messages(
+        messages=messages,
+        model="claude-special-alias",
+        max_tokens=100,
+    )
+
+    # Verify response
+    assert "id" in response
+    assert "content" in response
+    assert "model" in response
+    assert response["role"] == "assistant"
+
+    print(f"Non-streaming response: {json.dumps(response, indent=2)}")
+    return response
+
+
+class TestCustomLogger(CustomLogger):
+    def __init__(self):
+        super().__init__()
+        self.logged_standard_logging_payload: Optional[StandardLoggingPayload] = None
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print("inside async_log_success_event")
+        self.logged_standard_logging_payload = kwargs.get("standard_logging_object")
+
+        pass
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_litellm_router_non_streaming_with_logging():
+    """
+    Test the anthropic_messages with non-streaming request
+
+    - Ensure Cost + Usage is tracked
+    """
+    test_custom_logger = TestCustomLogger()
+    litellm.callbacks = [test_custom_logger]
+    litellm._turn_on_debug()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "claude-special-alias",
+                "litellm_params": {
+                    "model": "claude-3-haiku-20240307",
+                    "api_key": os.getenv("ANTHROPIC_API_KEY"),
+                },
+            }
+        ]
+    )
+
+    # Set up test parameters
+    messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+
+    # Call the handler
+    response = await router.aanthropic_messages(
+        messages=messages,
+        model="claude-special-alias",
+        max_tokens=100,
+    )
+
+    # Verify response
+    _validate_anthropic_response(response)
+
+    print(f"Non-streaming response: {json.dumps(response, indent=2)}")
+
+    await asyncio.sleep(1)
+    assert test_custom_logger.logged_standard_logging_payload["messages"] == messages
+    assert test_custom_logger.logged_standard_logging_payload["response"] is not None
+    assert (
+        test_custom_logger.logged_standard_logging_payload["model"]
+        == "claude-3-haiku-20240307"
+    )
+
+    # check logged usage + spend
+    assert test_custom_logger.logged_standard_logging_payload["response_cost"] > 0
+    assert (
+        test_custom_logger.logged_standard_logging_payload["prompt_tokens"]
+        == response["usage"]["input_tokens"]
+    )
+    assert (
+        test_custom_logger.logged_standard_logging_payload["completion_tokens"]
+        == response["usage"]["output_tokens"]
+    )
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_litellm_router_streaming_with_logging():
+    """
+    Test the anthropic_messages with streaming request
+
+    - Ensure Cost + Usage is tracked
+    """
+    test_custom_logger = TestCustomLogger()
+    litellm.callbacks = [test_custom_logger]
+    # litellm._turn_on_debug()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "claude-special-alias",
+                "litellm_params": {
+                    "model": "claude-3-haiku-20240307",
+                    "api_key": os.getenv("ANTHROPIC_API_KEY"),
+                },
+            }
+        ]
+    )
+
+    # Set up test parameters
+    messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+
+    # Call the handler
+    response = await router.aanthropic_messages(
+        messages=messages,
+        model="claude-special-alias",
+        max_tokens=100,
+        stream=True,
+    )
+
+    response_prompt_tokens = 0
+    response_completion_tokens = 0
+    all_anthropic_usage_chunks = []
+
+    async for chunk in response:
+        # Decode chunk if it's bytes
+        print("chunk=", chunk)
+
+        # Handle SSE format chunks
+        if isinstance(chunk, bytes):
+            chunk_str = chunk.decode("utf-8")
+            # Extract the JSON data part from SSE format
+            for line in chunk_str.split("\n"):
+                if line.startswith("data: "):
+                    try:
+                        json_data = json.loads(line[6:])  # Skip the 'data: ' prefix
+                        print(
+                            "\n\nJSON data:",
+                            json.dumps(json_data, indent=4, default=str),
+                        )
+
+                        # Extract usage information
+                        if (
+                            json_data.get("type") == "message_start"
+                            and "message" in json_data
+                        ):
+                            if "usage" in json_data["message"]:
+                                usage = json_data["message"]["usage"]
+                                all_anthropic_usage_chunks.append(usage)
+                                print(
+                                    "USAGE BLOCK",
+                                    json.dumps(usage, indent=4, default=str),
+                                )
+                        elif "usage" in json_data:
+                            usage = json_data["usage"]
+                            all_anthropic_usage_chunks.append(usage)
+                            print(
+                                "USAGE BLOCK", json.dumps(usage, indent=4, default=str)
+                            )
+                    except json.JSONDecodeError:
+                        print(f"Failed to parse JSON from: {line[6:]}")
+        elif hasattr(chunk, "message"):
+            if chunk.message.usage:
+                print(
+                    "USAGE BLOCK",
+                    json.dumps(chunk.message.usage, indent=4, default=str),
+                )
+                all_anthropic_usage_chunks.append(chunk.message.usage)
+        elif hasattr(chunk, "usage"):
+            print("USAGE BLOCK", json.dumps(chunk.usage, indent=4, default=str))
+            all_anthropic_usage_chunks.append(chunk.usage)
+
+    print(
+        "all_anthropic_usage_chunks",
+        json.dumps(all_anthropic_usage_chunks, indent=4, default=str),
+    )
+
+    # Extract token counts from usage data
+    if all_anthropic_usage_chunks:
+        response_prompt_tokens = max(
+            [usage.get("input_tokens", 0) for usage in all_anthropic_usage_chunks]
+        )
+        response_completion_tokens = max(
+            [usage.get("output_tokens", 0) for usage in all_anthropic_usage_chunks]
+        )
+
+    print("input_tokens_anthropic_api", response_prompt_tokens)
+    print("output_tokens_anthropic_api", response_completion_tokens)
+
+    await asyncio.sleep(4)
+
+    print(
+        "logged_standard_logging_payload",
+        json.dumps(
+            test_custom_logger.logged_standard_logging_payload, indent=4, default=str
+        ),
+    )
+
+    assert test_custom_logger.logged_standard_logging_payload["messages"] == messages
+    assert test_custom_logger.logged_standard_logging_payload["response"] is not None
+    assert (
+        test_custom_logger.logged_standard_logging_payload["model"]
+        == "claude-3-haiku-20240307"
+    )
+
+    # check logged usage + spend
+    assert test_custom_logger.logged_standard_logging_payload["response_cost"] > 0
+    assert (
+        test_custom_logger.logged_standard_logging_payload["prompt_tokens"]
+        == response_prompt_tokens
+    )
+    assert (
+        test_custom_logger.logged_standard_logging_payload["completion_tokens"]
+        == response_completion_tokens
+    )
+
+
+@pytest.mark.asyncio
+async def test_anthropic_messages_with_extra_headers():
+    """
+    Test the anthropic_messages with extra headers
+    """
+    # Get API key from environment
+    api_key = os.getenv("ANTHROPIC_API_KEY", "fake-api-key")
+
+    # Set up test parameters
+    messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+    extra_headers = {
+        "anthropic-beta": "very-custom-beta-value",
+        "anthropic-version": "custom-version-for-test",
+    }
+
+    # Create a mock response
+    mock_response = MagicMock()
+    mock_response.raise_for_status = MagicMock()
+    mock_response.json.return_value = {
+        "id": "msg_123456",
+        "type": "message",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "text",
+                "text": "Why did the chicken cross the road? To get to the other side!",
+            }
+        ],
+        "model": "claude-3-haiku-20240307",
+        "stop_reason": "end_turn",
+        "usage": {"input_tokens": 10, "output_tokens": 20},
+    }
+
+    # Create a mock client with AsyncMock for the post method
+    mock_client = MagicMock(spec=AsyncHTTPHandler)
+    mock_client.post = AsyncMock(return_value=mock_response)
+
+    # Call the handler with extra_headers and our mocked client
+    response = await anthropic_messages(
+        messages=messages,
+        api_key=api_key,
+        model="claude-3-haiku-20240307",
+        max_tokens=100,
+        client=mock_client,
+        provider_specific_header={
+            "custom_llm_provider": "anthropic",
+            "extra_headers": extra_headers,
+        },
+    )
+
+    # Verify the post method was called with the right parameters
+    mock_client.post.assert_called_once()
+    call_kwargs = mock_client.post.call_args.kwargs
+
+    # Verify headers were passed correctly
+    headers = call_kwargs.get("headers", {})
+    print("HEADERS IN REQUEST", headers)
+    for key, value in extra_headers.items():
+        assert key in headers
+        assert headers[key] == value
+
+    # Verify the response was processed correctly
+    assert response == mock_response.json.return_value
+
+    return response
@@ -54,7 +54,7 @@ async def test_get_litellm_virtual_key():


 @pytest.mark.asyncio
-async def test_vertex_proxy_route_api_key_auth():
+async def test_async_vertex_proxy_route_api_key_auth():
     """
     Critical

@@ -207,7 +207,7 @@ async def test_get_vertex_credentials_stored():
     router.add_vertex_credentials(
         project_id="test-project",
         location="us-central1",
-        vertex_credentials="test-creds",
+        vertex_credentials='{"credentials": "test-creds"}',
     )

     creds = router.get_vertex_credentials(
@@ -215,7 +215,7 @@ async def test_get_vertex_credentials_stored():
     )
     assert creds.vertex_project == "test-project"
     assert creds.vertex_location == "us-central1"
-    assert creds.vertex_credentials == "test-creds"
+    assert creds.vertex_credentials == '{"credentials": "test-creds"}'


 @pytest.mark.asyncio
@@ -227,18 +227,20 @@ async def test_add_vertex_credentials():
     router.add_vertex_credentials(
         project_id="test-project",
         location="us-central1",
-        vertex_credentials="test-creds",
+        vertex_credentials='{"credentials": "test-creds"}',
     )

     assert "test-project-us-central1" in router.deployment_key_to_vertex_credentials
     creds = router.deployment_key_to_vertex_credentials["test-project-us-central1"]
     assert creds.vertex_project == "test-project"
     assert creds.vertex_location == "us-central1"
-    assert creds.vertex_credentials == "test-creds"
+    assert creds.vertex_credentials == '{"credentials": "test-creds"}'

     # Test adding with None values
     router.add_vertex_credentials(
-        project_id=None, location=None, vertex_credentials="test-creds"
+        project_id=None,
+        location=None,
+        vertex_credentials='{"credentials": "test-creds"}',
     )
     # Should not add None values
     assert len(router.deployment_key_to_vertex_credentials) == 1
@@ -6,6 +6,7 @@ from typing import Optional
 from dotenv import load_dotenv
 from fastapi import Request
 from datetime import datetime
+from unittest.mock import AsyncMock, patch

 sys.path.insert(
     0, os.path.abspath("../..")
@@ -289,43 +290,6 @@ async def test_aaaaatext_completion_endpoint(model_list, sync_mode):
     assert response.choices[0].text == "I'm fine, thank you!"


-@pytest.mark.asyncio
-async def test_anthropic_router_completion_e2e(model_list):
-    from litellm.adapters.anthropic_adapter import anthropic_adapter
-    from litellm.types.llms.anthropic import AnthropicResponse
-
-    litellm.set_verbose = True
-
-    litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
-
-    router = Router(model_list=model_list)
-    messages = [{"role": "user", "content": "Hey, how's it going?"}]
-
-    ## Test 1: user facing function
-    response = await router.aadapter_completion(
-        model="claude-3-5-sonnet-20240620",
-        messages=messages,
-        adapter_id="anthropic",
-        mock_response="This is a fake call",
-    )
-
-    ## Test 2: underlying function
-    await router._aadapter_completion(
-        model="claude-3-5-sonnet-20240620",
-        messages=messages,
-        adapter_id="anthropic",
-        mock_response="This is a fake call",
-    )
-
-    print("Response: {}".format(response))
-
-    assert response is not None
-
-    AnthropicResponse.model_validate(response)
-
-    assert response.model == "gpt-3.5-turbo"
-
-
 @pytest.mark.asyncio
 async def test_router_with_empty_choices(model_list):
     """
@ -349,3 +313,200 @@ async def test_router_with_empty_choices(model_list):
|
||||||
mock_response=mock_response,
|
mock_response=mock_response,
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_ageneric_api_call_with_fallbacks_basic():
|
||||||
|
"""
|
||||||
|
Test the _ageneric_api_call_with_fallbacks method with a basic successful call
|
||||||
|
"""
|
||||||
|
# Create a mock function that will be passed to _ageneric_api_call_with_fallbacks
|
||||||
|
mock_function = AsyncMock()
|
||||||
|
mock_function.__name__ = "test_function"
|
||||||
|
|
||||||
|
# Create a mock response
|
||||||
|
mock_response = {
|
||||||
|
"id": "resp_123456",
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "This is a test response",
|
||||||
|
"model": "test-model",
|
||||||
|
"usage": {"input_tokens": 10, "output_tokens": 20},
|
||||||
|
}
|
||||||
|
mock_function.return_value = mock_response
|
||||||
|
|
||||||
|
# Create a router with a test model
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "test-model-alias",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "anthropic/test-model",
|
||||||
|
"api_key": "fake-api-key",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Call the _ageneric_api_call_with_fallbacks method
|
||||||
|
response = await router._ageneric_api_call_with_fallbacks(
|
||||||
|
model="test-model-alias",
|
||||||
|
original_function=mock_function,
|
||||||
|
messages=[{"role": "user", "content": "Hello"}],
|
||||||
|
max_tokens=100,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify the mock function was called
|
||||||
|
mock_function.assert_called_once()
|
||||||
|
|
||||||
|
# Verify the response
|
||||||
|
assert response == mock_response
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_aadapter_completion():
    """
    Test the aadapter_completion method which uses async_function_with_fallbacks
    """
    # Create a mock for the _aadapter_completion method
    mock_response = {
        "id": "adapter_resp_123",
        "object": "adapter.completion",
        "created": 1677858242,
        "model": "test-model-with-adapter",
        "choices": [
            {
                "text": "This is a test adapter response",
                "index": 0,
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
    }

    # Create a router with a patched _aadapter_completion method
    with patch.object(
        Router, "_aadapter_completion", new_callable=AsyncMock
    ) as mock_method:
        mock_method.return_value = mock_response

        router = Router(
            model_list=[
                {
                    "model_name": "test-adapter-model",
                    "litellm_params": {
                        "model": "anthropic/test-model",
                        "api_key": "fake-api-key",
                    },
                }
            ]
        )

        # Replace the async_function_with_fallbacks with a mock
        router.async_function_with_fallbacks = AsyncMock(return_value=mock_response)

        # Call the aadapter_completion method
        response = await router.aadapter_completion(
            adapter_id="test-adapter-id",
            model="test-adapter-model",
            prompt="This is a test prompt",
            max_tokens=100,
        )

        # Verify the response
        assert response == mock_response

        # Verify async_function_with_fallbacks was called with the right parameters
        router.async_function_with_fallbacks.assert_called_once()
        call_kwargs = router.async_function_with_fallbacks.call_args.kwargs
        assert call_kwargs["adapter_id"] == "test-adapter-id"
        assert call_kwargs["model"] == "test-adapter-model"
        assert call_kwargs["prompt"] == "This is a test prompt"
        assert call_kwargs["max_tokens"] == 100
        assert call_kwargs["original_function"] == router._aadapter_completion
        assert "metadata" in call_kwargs
        assert call_kwargs["metadata"]["model_group"] == "test-adapter-model"

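The keyword assertions above outline the shape of the public wrapper: `aadapter_completion` does no provider work itself and simply forwards its arguments, plus `original_function=self._aadapter_completion` and a `metadata["model_group"]` entry, into `async_function_with_fallbacks`. A rough sketch of that shape, inferred from the test rather than copied from the Router source:

# Inferred shape of the public wrapper (illustrative only, not the actual
# Router implementation): everything is handed to async_function_with_fallbacks,
# which applies the configured retry/fallback policy before invoking the
# underlying _aadapter_completion.
async def aadapter_completion_shape(router, adapter_id: str, model: str, **kwargs):
    return await router.async_function_with_fallbacks(
        adapter_id=adapter_id,
        model=model,
        original_function=router._aadapter_completion,
        metadata={"model_group": model},
        **kwargs,
    )
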
@pytest.mark.asyncio
async def test__aadapter_completion():
    """
    Test the _aadapter_completion method directly
    """
    # Create a mock response for litellm.aadapter_completion
    mock_response = {
        "id": "adapter_resp_123",
        "object": "adapter.completion",
        "created": 1677858242,
        "model": "test-model-with-adapter",
        "choices": [
            {
                "text": "This is a test adapter response",
                "index": 0,
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
    }

    # Create a router with a mocked litellm.aadapter_completion
    with patch(
        "litellm.aadapter_completion", new_callable=AsyncMock
    ) as mock_adapter_completion:
        mock_adapter_completion.return_value = mock_response

        router = Router(
            model_list=[
                {
                    "model_name": "test-adapter-model",
                    "litellm_params": {
                        "model": "anthropic/test-model",
                        "api_key": "fake-api-key",
                    },
                }
            ]
        )

        # Mock the async_get_available_deployment method
        router.async_get_available_deployment = AsyncMock(
            return_value={
                "model_name": "test-adapter-model",
                "litellm_params": {
                    "model": "test-model",
                    "api_key": "fake-api-key",
                },
                "model_info": {
                    "id": "test-unique-id",
                },
            }
        )

        # Mock the async_routing_strategy_pre_call_checks method
        router.async_routing_strategy_pre_call_checks = AsyncMock()

        # Call the _aadapter_completion method
        response = await router._aadapter_completion(
            adapter_id="test-adapter-id",
            model="test-adapter-model",
            prompt="This is a test prompt",
            max_tokens=100,
        )

        # Verify the response
        assert response == mock_response

        # Verify litellm.aadapter_completion was called with the right parameters
        mock_adapter_completion.assert_called_once()
        call_kwargs = mock_adapter_completion.call_args.kwargs
        assert call_kwargs["adapter_id"] == "test-adapter-id"
        assert call_kwargs["model"] == "test-model"
        assert call_kwargs["prompt"] == "This is a test prompt"
        assert call_kwargs["max_tokens"] == 100
        assert call_kwargs["api_key"] == "fake-api-key"
        assert call_kwargs["caching"] == router.cache_responses

        # Verify the success call was recorded
        assert router.success_calls["test-model"] == 1
        assert router.total_calls["test-model"] == 1

        # Verify async_routing_strategy_pre_call_checks was called
        router.async_routing_strategy_pre_call_checks.assert_called_once()

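Note that `_aadapter_completion` resolves the `test-adapter-model` alias to the deployment's underlying `test-model` before calling `litellm.aadapter_completion`, which is why the final assertions track `success_calls` and `total_calls` under the deployment name rather than the alias. All three new tests build their Router inline and mock provider traffic, so they can be run in isolation; a selective invocation sketch follows (the target test module is assumed, since the diff excerpt does not show the file header):

# Run only the new router tests via pytest's keyword filter (illustrative;
# point pytest at whichever test module this hunk belongs to).
import pytest

if __name__ == "__main__":
    raise SystemExit(
        pytest.main(["-q", "-k", "ageneric_api_call_with_fallbacks or aadapter_completion"])
    )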