LiteLLM Minor Fixes & Improvements (12/05/2024) (#7037)

* fix(together_ai/chat): only return response_format + tools for supported models

Fixes https://github.com/BerriAI/litellm/issues/6972

* feat(bedrock/rerank): initial working commit for bedrock rerank api support

Closes https://github.com/BerriAI/litellm/issues/7021

* feat(bedrock/rerank): async bedrock rerank api support

Addresses https://github.com/BerriAI/litellm/issues/7021

* build(model_prices_and_context_window.json): add 'supports_prompt_caching' for bedrock models + clean up cross-region entries from the model list (duplicate information that led to inconsistencies)

* docs(json_mode.md): clarify model support for json schema

Closes https://github.com/BerriAI/litellm/issues/6998

* fix(_service_logger.py): handle dd callback in list

Ensures failed spend tracking is logged to Datadog.

* feat(converse_transformation.py): translate from anthropic format to bedrock format

Closes https://github.com/BerriAI/litellm/issues/7030

* fix: fix linting errors

* test: fix test
Author: Krish Dholakia
Date: 2024-12-05 00:02:31 -08:00 (committed by GitHub)
Parent: 12dfd14b52
Commit: 61b35c12bb
24 changed files with 858 additions and 400 deletions


@@ -51,6 +51,9 @@ curl http://0.0.0.0:4000/v1/chat/completions \
## Check Model Support
### 1. Check if model supports `response_format`
Call `litellm.get_supported_openai_params` to check if a model/provider supports `response_format`.
```python
@@ -61,6 +64,20 @@ params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_prov
assert "response_format" in params
```
### 2. Check if model supports `json_schema`
This is used to check if you can pass
- `response_format={ "type": "json_schema", "json_schema": … , "strict": true }`
- `response_format=<Pydantic Model>`
```python
from litellm import supports_response_schema
assert supports_response_schema(model="gemini-1.5-pro-preview-0215", custom_llm_provider="bedrock")
```
Check out [model_prices_and_context_window.json](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) for a full list of models and their support for `response_schema`.
## Pass in 'json_schema'
To use Structured Outputs, simply specify
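For context, this is roughly how the documented checks combine with a structured-output call; the model name and schema below are illustrative, not part of this change:

```python
from pydantic import BaseModel

from litellm import completion, supports_response_schema


class EventInfo(BaseModel):  # illustrative schema, not part of this PR
    name: str
    date: str


model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"  # illustrative model

# Check support first, as the docs above describe, then pass the Pydantic model directly.
if supports_response_schema(model=model):
    resp = completion(
        model=model,
        messages=[{"role": "user", "content": "Extract: the launch party is on 2024-12-05."}],
        response_format=EventInfo,  # or a raw {"type": "json_schema", ...} dict
    )
    print(resp.choices[0].message.content)
```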


@@ -7,6 +7,7 @@ from litellm._logging import verbose_logger
from litellm.proxy._types import UserAPIKeyAuth
from .integrations.custom_logger import CustomLogger
from .integrations.datadog.datadog import DataDogLogger
from .integrations.prometheus_services import PrometheusServicesLogger
from .types.services import ServiceLoggerPayload, ServiceTypes
@@ -134,9 +135,7 @@ class ServiceLogging(CustomLogger):
await self.prometheusServicesLogger.async_service_success_hook(
payload=payload
)
-elif callback == "datadog":
-from litellm.integrations.datadog.datadog import DataDogLogger
+elif callback == "datadog" or isinstance(callback, DataDogLogger):
await self.init_datadog_logger_if_none()
await self.dd_logger.async_service_success_hook(
payload=payload,
@@ -237,6 +236,7 @@ class ServiceLogging(CustomLogger):
duration=duration,
call_type=call_type,
)
for callback in litellm.service_callback:
if callback == "prometheus_system":
await self.init_prometheus_services_logger_if_none()
@@ -244,7 +244,7 @@ class ServiceLogging(CustomLogger):
payload=payload,
error=error,
)
-elif callback == "datadog":
+elif callback == "datadog" or isinstance(callback, DataDogLogger):
await self.init_datadog_logger_if_none()
await self.dd_logger.async_service_failure_hook(
payload=payload,
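A rough sketch of what the new `isinstance(callback, DataDogLogger)` branch enables: registering a DataDog logger *instance* in `litellm.service_callback` (names follow the diff and the new unit test at the bottom of this commit; assumes `DD_API_KEY`/`DD_SITE` are configured for the Datadog integration):

```python
import litellm
from litellm._service_logger import ServiceLogging
from litellm.integrations.datadog.datadog import DataDogLogger
from litellm.types.services import ServiceTypes

# A logger *instance*, not just the "datadog" string, now also receives
# service success/failure events (e.g. failed spend-tracking DB writes).
litellm.service_callback = [DataDogLogger()]

service_logging = ServiceLogging()

# Somewhere inside async proxy code (call_type/error values here are illustrative):
# await service_logging.async_service_failure_hook(
#     service=ServiceTypes.DB,
#     call_type="update_spend",
#     error="db write failed",
#     duration=1.0,
# )
```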


@@ -5,7 +5,7 @@ Translating between OpenAI's `/chat/completion` format and Amazon's `/converse`
import copy
import time
import types
-from typing import List, Optional, Union
+from typing import List, Literal, Optional, Tuple, Union, cast, overload
import httpx
@@ -255,6 +255,59 @@
)
return optional_params
@overload
def _get_cache_point_block(
self, message_block: dict, block_type: Literal["system"]
) -> Optional[SystemContentBlock]:
pass
@overload
def _get_cache_point_block(
self, message_block: dict, block_type: Literal["content_block"]
) -> Optional[ContentBlock]:
pass
def _get_cache_point_block(
self, message_block: dict, block_type: Literal["system", "content_block"]
) -> Optional[Union[SystemContentBlock, ContentBlock]]:
if message_block.get("cache_control", None) is None:
return None
if block_type == "system":
return SystemContentBlock(cachePoint=CachePointBlock(type="default"))
else:
return ContentBlock(cachePoint=CachePointBlock(type="default"))
def _transform_system_message(
self, messages: List[AllMessageValues]
) -> Tuple[List[AllMessageValues], List[SystemContentBlock]]:
system_prompt_indices = []
system_content_blocks: List[SystemContentBlock] = []
for idx, message in enumerate(messages):
if message["role"] == "system":
_system_content_block: Optional[SystemContentBlock] = None
_cache_point_block: Optional[SystemContentBlock] = None
if isinstance(message["content"], str) and len(message["content"]) > 0:
_system_content_block = SystemContentBlock(text=message["content"])
_cache_point_block = self._get_cache_point_block(
cast(dict, message), block_type="system"
)
elif isinstance(message["content"], list):
for m in message["content"]:
if m.get("type", "") == "text" and len(m["text"]) > 0:
_system_content_block = SystemContentBlock(text=m["text"])
_cache_point_block = self._get_cache_point_block(
m, block_type="system"
)
if _system_content_block is not None:
system_content_blocks.append(_system_content_block)
if _cache_point_block is not None:
system_content_blocks.append(_cache_point_block)
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
return messages, system_content_blocks
def _transform_request(
self,
model: str,
@@ -262,24 +315,7 @@
optional_params: dict,
litellm_params: dict,
) -> RequestObject:
-system_prompt_indices = []
-system_content_blocks: List[SystemContentBlock] = []
-for idx, message in enumerate(messages):
-if message["role"] == "system":
-_system_content_block: Optional[SystemContentBlock] = None
-if isinstance(message["content"], str) and len(message["content"]) > 0:
-_system_content_block = SystemContentBlock(text=message["content"])
-elif isinstance(message["content"], list):
-for m in message["content"]:
-if m.get("type", "") == "text" and len(m["text"]) > 0:
-_system_content_block = SystemContentBlock(text=m["text"])
-if _system_content_block is not None:
-system_content_blocks.append(_system_content_block)
-system_prompt_indices.append(idx)
-if len(system_prompt_indices) > 0:
-for idx in reversed(system_prompt_indices):
-messages.pop(idx)
+messages, system_content_blocks = self._transform_system_message(messages)
inference_params = copy.deepcopy(optional_params)
additional_request_keys = []
additional_request_params = {}
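A small sketch of what the new `_transform_system_message` helper produces for a cache-marked system message (mirrors the prompt-caching tests added later in this commit; the commented output shape is approximate):

```python
import litellm

config = litellm.AmazonConverseConfig()

messages = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "Here is the full text of a complex legal agreement",
                "cache_control": {"type": "ephemeral"},
            }
        ],
    },
    {"role": "user", "content": "What are the key terms?"},
]

remaining, system_blocks = config._transform_system_message(messages)
# system_blocks ~ [{"text": "Here is the full ..."}, {"cachePoint": {"type": "default"}}]
# `remaining` no longer contains the system message.
```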


@@ -0,0 +1,159 @@
import copy
import json
import os
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
import httpx
from openai.types.image import Image
from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
get_async_httpx_client,
)
from litellm.types.llms.bedrock import BedrockPreparedRequest, BedrockRerankRequest
from litellm.types.rerank import RerankRequest
from litellm.types.utils import RerankResponse
from ...base_aws_llm import BaseAWSLLM
from ..common_utils import BedrockError
from .transformation import BedrockRerankConfig
if TYPE_CHECKING:
from botocore.awsrequest import AWSPreparedRequest
else:
AWSPreparedRequest = Any
class BedrockRerankHandler(BaseAWSLLM):
async def arerank(
self,
prepared_request: BedrockPreparedRequest,
):
client = get_async_httpx_client(llm_provider=litellm.LlmProviders.BEDROCK)
try:
response = await client.post(url=prepared_request["endpoint_url"], headers=prepared_request["prepped"].headers, data=prepared_request["body"]) # type: ignore
response.raise_for_status()
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise BedrockError(status_code=error_code, message=err.response.text)
except httpx.TimeoutException:
raise BedrockError(status_code=408, message="Timeout error occurred.")
return BedrockRerankConfig()._transform_response(response.json())
def rerank(
self,
model: str,
query: str,
documents: List[Union[str, Dict[str, Any]]],
optional_params: dict,
logging_obj: LitellmLogging,
top_n: Optional[int] = None,
rank_fields: Optional[List[str]] = None,
return_documents: Optional[bool] = True,
max_chunks_per_doc: Optional[int] = None,
_is_async: Optional[bool] = False,
api_base: Optional[str] = None,
extra_headers: Optional[dict] = None,
) -> RerankResponse:
request_data = RerankRequest(
model=model,
query=query,
documents=documents,
top_n=top_n,
rank_fields=rank_fields,
return_documents=return_documents,
)
data = BedrockRerankConfig()._transform_request(request_data)
prepared_request = self._prepare_request(
optional_params=optional_params,
api_base=api_base,
extra_headers=extra_headers,
data=cast(dict, data),
)
logging_obj.pre_call(
input=data,
api_key="",
additional_args={
"complete_input_dict": data,
"api_base": prepared_request["endpoint_url"],
"headers": prepared_request["prepped"].headers,
},
)
if _is_async:
return self.arerank(prepared_request) # type: ignore
client = _get_httpx_client()
try:
response = client.post(url=prepared_request["endpoint_url"], headers=prepared_request["prepped"].headers, data=prepared_request["body"]) # type: ignore
response.raise_for_status()
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise BedrockError(status_code=error_code, message=err.response.text)
except httpx.TimeoutException:
raise BedrockError(status_code=408, message="Timeout error occurred.")
return BedrockRerankConfig()._transform_response(response.json())
def _prepare_request(
self,
api_base: Optional[str],
extra_headers: Optional[dict],
data: dict,
optional_params: dict,
) -> BedrockPreparedRequest:
try:
import boto3
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest
from botocore.credentials import Credentials
except ImportError:
raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")
boto3_credentials_info = self._get_boto_credentials_from_optional_params(
optional_params
)
### SET RUNTIME ENDPOINT ###
_, proxy_endpoint_url = self.get_runtime_endpoint(
api_base=api_base,
aws_bedrock_runtime_endpoint=boto3_credentials_info.aws_bedrock_runtime_endpoint,
aws_region_name=boto3_credentials_info.aws_region_name,
)
proxy_endpoint_url = proxy_endpoint_url.replace(
"bedrock-runtime", "bedrock-agent-runtime"
)
proxy_endpoint_url = f"{proxy_endpoint_url}/rerank"
sigv4 = SigV4Auth(
boto3_credentials_info.credentials,
"bedrock",
boto3_credentials_info.aws_region_name,
)
# Make POST Request
body = json.dumps(data).encode("utf-8")
headers = {"Content-Type": "application/json"}
if extra_headers is not None:
headers = {"Content-Type": "application/json", **extra_headers}
request = AWSRequest(
method="POST", url=proxy_endpoint_url, data=body, headers=headers
)
sigv4.add_auth(request)
if (
extra_headers is not None and "Authorization" in extra_headers
): # prevent sigv4 from overwriting the auth header
request.headers["Authorization"] = extra_headers["Authorization"]
prepped = request.prepare()
return BedrockPreparedRequest(
endpoint_url=proxy_endpoint_url,
prepped=prepped,
body=body,
data=data,
)
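For reference, a minimal synchronous call that exercises this handler through `litellm.rerank` (model ARN taken from the new Bedrock rerank test below; assumes standard AWS credentials are available in the environment):

```python
import litellm

response = litellm.rerank(
    model="bedrock/arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0",
    query="hello",
    documents=["hello", "world"],
    top_n=2,
)
print(response.results)  # e.g. [{"index": 0, "relevance_score": ...}, ...]
```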


@@ -0,0 +1,117 @@
"""
Translates from Cohere's `/v1/rerank` input format to Bedrock's `/rerank` input format.
Why separate file? Make it easy to see how transformation works
"""
import uuid
from typing import List, Optional, Union
from litellm.types.llms.bedrock import (
BedrockRerankBedrockRerankingConfiguration,
BedrockRerankConfiguration,
BedrockRerankInlineDocumentSource,
BedrockRerankModelConfiguration,
BedrockRerankQuery,
BedrockRerankRequest,
BedrockRerankSource,
BedrockRerankTextDocument,
BedrockRerankTextQuery,
)
from litellm.types.rerank import (
RerankBilledUnits,
RerankRequest,
RerankResponse,
RerankResponseMeta,
RerankResponseResult,
RerankTokens,
)
class BedrockRerankConfig:
def _transform_sources(
self, documents: List[Union[str, dict]]
) -> List[BedrockRerankSource]:
"""
Transform the sources from RerankRequest format to Bedrock format.
"""
_sources = []
for document in documents:
if isinstance(document, str):
_sources.append(
BedrockRerankSource(
inlineDocumentSource=BedrockRerankInlineDocumentSource(
textDocument=BedrockRerankTextDocument(text=document),
type="TEXT",
),
type="INLINE",
)
)
else:
_sources.append(
BedrockRerankSource(
inlineDocumentSource=BedrockRerankInlineDocumentSource(
jsonDocument=document, type="JSON"
),
type="INLINE",
)
)
return _sources
def _transform_request(self, request_data: RerankRequest) -> BedrockRerankRequest:
"""
Transform the request from RerankRequest format to Bedrock format.
"""
_sources = self._transform_sources(request_data.documents)
return BedrockRerankRequest(
queries=[
BedrockRerankQuery(
textQuery=BedrockRerankTextQuery(text=request_data.query),
type="TEXT",
)
],
rerankingConfiguration=BedrockRerankConfiguration(
bedrockRerankingConfiguration=BedrockRerankBedrockRerankingConfiguration(
modelConfiguration=BedrockRerankModelConfiguration(
modelArn=request_data.model
),
numberOfResults=request_data.top_n or len(request_data.documents),
),
type="BEDROCK_RERANKING_MODEL",
),
sources=_sources,
)
def _transform_response(self, response: dict) -> RerankResponse:
"""
Transform the response from Bedrock into the RerankResponse format.
example input:
{"results":[{"index":0,"relevanceScore":0.6847912669181824},{"index":1,"relevanceScore":0.5980774760246277}]}
"""
_billed_units = RerankBilledUnits(**response.get("usage", {}))
_tokens = RerankTokens(**response.get("usage", {}))
rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens)
_results: Optional[List[RerankResponseResult]] = None
bedrock_results = response.get("results")
if bedrock_results:
_results = [
RerankResponseResult(
index=result.get("index"),
relevance_score=result.get("relevanceScore"),
)
for result in bedrock_results
]
if _results is None:
raise ValueError(f"No results found in the response={response}")
return RerankResponse(
id=response.get("id") or str(uuid.uuid4()),
results=_results,
meta=rerank_meta,
) # Return response
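A quick sketch of the Cohere-to-Bedrock translation above in isolation (module path as added in this PR; the commented request shape is approximate):

```python
from litellm.llms.bedrock.rerank.transformation import BedrockRerankConfig
from litellm.types.rerank import RerankRequest

request = RerankRequest(
    model="arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0",
    query="hello",
    documents=["hello", "world"],
    top_n=2,
)

bedrock_body = BedrockRerankConfig()._transform_request(request)
# bedrock_body ~ {
#     "queries": [{"textQuery": {"text": "hello"}, "type": "TEXT"}],
#     "rerankingConfiguration": {
#         "bedrockRerankingConfiguration": {
#             "modelConfiguration": {"modelArn": "arn:aws:bedrock:..."},
#             "numberOfResults": 2,
#         },
#         "type": "BEDROCK_RERANKING_MODEL",
#     },
#     "sources": [{"inlineDocumentSource": {...}, "type": "INLINE"}, ...],
# }
```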


@@ -31,6 +31,6 @@ class JinaAIRerankConfig:
return RerankResponse(
id=response.get("id") or str(uuid.uuid4()),
-results=_results,
+results=_results,  # type: ignore
meta=rerank_meta,
)  # Return response


@@ -2485,10 +2485,24 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
image_url=image_url
)
_parts.append(_part)  # type: ignore
_cache_point_block = (
litellm.AmazonConverseConfig()._get_cache_point_block(
element, block_type="content_block"
)
)
if _cache_point_block is not None:
_parts.append(_cache_point_block)
user_content.extend(_parts)
else:
_part = BedrockContentBlock(text=messages[msg_i]["content"])
_cache_point_block = (
litellm.AmazonConverseConfig()._get_cache_point_block(
messages[msg_i], block_type="content_block"
)
)
user_content.append(_part)
if _cache_point_block is not None:
user_content.append(_cache_point_block)
msg_i += 1
if user_content:
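End to end, the cache-point plumbing above is what a prompt-caching completion call relies on; a hedged sketch (model name illustrative, mirroring the new `test_prompt_caching` base test added below):

```python
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",  # illustrative model
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    # translated into a Bedrock cachePoint block by the code above
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {"role": "user", "content": "What are the key terms and conditions in this agreement?"},
    ],
    max_tokens=10,
)
print(response.usage)
```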


@@ -6,8 +6,54 @@ Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""
from typing import Optional
from litellm import get_model_info, verbose_logger
from ..OpenAI.chat.gpt_transformation import OpenAIGPTConfig
class TogetherAIConfig(OpenAIGPTConfig):
-pass
+def get_supported_openai_params(self, model: str) -> list:
"""
Only some together models support response_format / tool calling
Docs: https://docs.together.ai/docs/json-mode
"""
supports_function_calling: Optional[bool] = None
try:
model_info = get_model_info(model, custom_llm_provider="together_ai")
supports_function_calling = model_info.get(
"supports_function_calling", False
)
except Exception as e:
verbose_logger.debug(f"Error getting supported openai params: {e}")
pass
optional_params = super().get_supported_openai_params(model)
if supports_function_calling is not True:
verbose_logger.warning(
"Only some together models support function calling/response_format. Docs - https://docs.together.ai/docs/function-calling"
)
optional_params.remove("tools")
optional_params.remove("tool_choice")
optional_params.remove("function_call")
optional_params.remove("response_format")
return optional_params
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
mapped_openai_params = super().map_openai_params(
non_default_params, optional_params, model, drop_params
)
if "response_format" in mapped_openai_params and mapped_openai_params[
"response_format"
] == {"type": "text"}:
mapped_openai_params.pop("response_format")
return mapped_openai_params
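The effect of the new `get_supported_openai_params` override, mirroring the Together AI unit test added at the end of this commit (requires the local model cost map so `get_model_info` can resolve the model):

```python
import os

import litellm

os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

# Model with function calling support in the model map -> params kept
params = litellm.get_supported_openai_params(
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", custom_llm_provider="together_ai"
)
assert "response_format" in params and "tools" in params

# Model without function calling support -> params stripped
params = litellm.get_supported_openai_params(
    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", custom_llm_provider="together_ai"
)
assert "response_format" not in params and "tools" not in params
```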


@@ -29,6 +29,6 @@ class TogetherAIRerankConfig:
return RerankResponse(
id=response.get("id") or str(uuid.uuid4()),
-results=_results,
+results=_results,  # type: ignore
meta=rerank_meta,
)  # Return response


@@ -12,7 +12,8 @@
"supports_vision": true,
"supports_audio_input": true,
"supports_audio_output": true,
-"supports_prompt_caching": true
+"supports_prompt_caching": true,
"supports_response_schema": true
},
"gpt-4": {
"max_tokens": 4096,
@@ -4818,7 +4819,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_pdf_input": true
+"supports_pdf_input": true,
"supports_prompt_caching": true
},
"amazon.nova-lite-v1:0": {
"max_tokens": 4096,
@@ -4830,7 +4832,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_pdf_input": true
+"supports_pdf_input": true,
"supports_prompt_caching": true
},
"amazon.nova-pro-v1:0": {
"max_tokens": 4096,
@@ -4842,7 +4845,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_pdf_input": true
+"supports_pdf_input": true,
"supports_prompt_caching": true
},
"anthropic.claude-3-sonnet-20240229-v1:0": {
"max_tokens": 4096,
@@ -4876,7 +4880,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_assistant_prefill": true
+"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
@@ -4898,7 +4903,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_assistant_prefill": true,
-"supports_function_calling": true
+"supports_function_calling": true,
"supports_prompt_caching": true
},
"anthropic.claude-3-opus-20240229-v1:0": {
"max_tokens": 4096,
@@ -4911,139 +4917,6 @@
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-sonnet-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"us.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.00000125,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-haiku-20241022-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000005,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_assistant_prefill": true,
"supports_function_calling": true
},
"us.anthropic.claude-3-opus-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-sonnet-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"eu.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.00000125,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-haiku-20241022-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000005,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true
},
"eu.anthropic.claude-3-opus-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-v1": { "anthropic.claude-v1": {
"max_tokens": 8191, "max_tokens": 8191,
"max_input_tokens": 100000, "max_input_tokens": 100000,
@ -6097,6 +5970,30 @@
"litellm_provider": "together_ai", "litellm_provider": "together_ai",
"mode": "embedding" "mode": "embedding"
}, },
"together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": {
"input_cost_per_token": 0.00000018,
"output_cost_per_token": 0.00000018,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": {
"input_cost_per_token": 0.00000088,
"output_cost_per_token": 0.00000088,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {
"input_cost_per_token": 0.0000035,
"output_cost_per_token": 0.0000035,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006, "input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006, "output_cost_per_token": 0.0000006,


@@ -39,10 +39,10 @@ model_list:
access_groups: ["private-openai-models"]
router_settings:
-routing_strategy: usage-based-routing-v2
+# routing_strategy: usage-based-routing-v2
#redis_url: "os.environ/REDIS_URL"
redis_host: "os.environ/REDIS_HOST"
redis_port: "os.environ/REDIS_PORT"
litellm_settings:
-success_callback: ["langsmith"]
+callbacks: ["datadog"]


@@ -7,6 +7,7 @@ import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.azure_ai.rerank import AzureAIRerank
from litellm.llms.bedrock.rerank.handler import BedrockRerankHandler
from litellm.llms.cohere.rerank import CohereRerank
from litellm.llms.jina_ai.rerank.handler import JinaAIRerank
from litellm.llms.together_ai.rerank.handler import TogetherAIRerank
@@ -21,6 +22,7 @@ cohere_rerank = CohereRerank()
together_rerank = TogetherAIRerank()
azure_ai_rerank = AzureAIRerank()
jina_ai_rerank = JinaAIRerank()
bedrock_rerank = BedrockRerankHandler()
#################################################
@@ -70,7 +72,7 @@ async def arerank(
@client
-def rerank(
+def rerank(  # noqa: PLR0915
model: str,
query: str,
documents: List[Union[str, Dict[str, Any]]],
@@ -268,6 +270,27 @@ def rerank(
max_chunks_per_doc=max_chunks_per_doc,
_is_async=_is_async,
)
elif _custom_llm_provider == "bedrock":
api_base = (
dynamic_api_base
or optional_params.api_base
or litellm.api_base
or get_secret("BEDROCK_API_BASE") # type: ignore
)
response = bedrock_rerank.rerank(
model=model,
query=query,
documents=documents,
top_n=top_n,
rank_fields=rank_fields,
return_documents=return_documents,
max_chunks_per_doc=max_chunks_per_doc,
_is_async=_is_async,
optional_params=optional_params.model_dump(exclude_unset=True),
api_base=api_base,
logging_obj=litellm_logging_obj,
)
else:
raise ValueError(f"Unsupported provider: {_custom_llm_provider}")
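And the async path through `arerank`, which this commit wires up alongside the sync handler (same illustrative model ARN as the Bedrock rerank test; `BEDROCK_API_BASE`/`api_base` stay optional when standard AWS credentials are available):

```python
import asyncio

import litellm


async def main():
    response = await litellm.arerank(
        model="bedrock/arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0",
        query="hello",
        documents=["hello", "world"],
        top_n=2,
    )
    print(response)


asyncio.run(main())
```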


@@ -2,6 +2,7 @@ import json
from typing import Any, List, Literal, Optional, TypedDict, Union
from typing_extensions import (
TYPE_CHECKING,
Protocol,
Required,
Self,
@@ -14,8 +15,13 @@ from typing_extensions import (
from .openai import ChatCompletionToolCallChunk
class CachePointBlock(TypedDict, total=False):
type: Literal["default"]
-class SystemContentBlock(TypedDict):
+class SystemContentBlock(TypedDict, total=False):
text: str
cachePoint: CachePointBlock
class SourceBlock(TypedDict):
@@ -58,6 +64,7 @@ class ContentBlock(TypedDict, total=False):
document: DocumentBlock
toolResult: ToolResultBlock
toolUse: ToolUseBlock
cachePoint: CachePointBlock
class MessageBlock(TypedDict):
@@ -312,3 +319,71 @@ class AmazonStability3TextToImageResponse(TypedDict, total=False):
images: List[str]
seeds: List[str]
finish_reasons: List[str]
if TYPE_CHECKING:
from botocore.awsrequest import AWSPreparedRequest
else:
AWSPreparedRequest = Any
from pydantic import BaseModel
class BedrockPreparedRequest(TypedDict):
"""
Internal/Helper class for preparing the request for bedrock image generation
"""
endpoint_url: str
prepped: AWSPreparedRequest
body: bytes
data: dict
class BedrockRerankTextQuery(TypedDict):
text: str
class BedrockRerankQuery(TypedDict):
textQuery: BedrockRerankTextQuery
type: Literal["TEXT"]
class BedrockRerankModelConfiguration(TypedDict, total=False):
modelArn: Required[str]
modelConfiguration: dict
class BedrockRerankBedrockRerankingConfiguration(TypedDict):
modelConfiguration: BedrockRerankModelConfiguration
numberOfResults: int
class BedrockRerankConfiguration(TypedDict):
bedrockRerankingConfiguration: BedrockRerankBedrockRerankingConfiguration
type: Literal["BEDROCK_RERANKING_MODEL"]
class BedrockRerankTextDocument(TypedDict, total=False):
text: str
class BedrockRerankInlineDocumentSource(TypedDict, total=False):
jsonDocument: dict
textDocument: BedrockRerankTextDocument
type: Literal["TEXT", "JSON"]
class BedrockRerankSource(TypedDict):
inlineDocumentSource: BedrockRerankInlineDocumentSource
type: Literal["INLINE"]
class BedrockRerankRequest(TypedDict):
"""
Request for Bedrock Rerank API
"""
queries: List[BedrockRerankQuery]
rerankingConfiguration: BedrockRerankConfiguration
sources: List[BedrockRerankSource]
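These TypedDicts are plain dicts at runtime; a short sketch of how the rerank transformation assembles them (shape only, no API call):

```python
from litellm.types.llms.bedrock import (
    BedrockRerankInlineDocumentSource,
    BedrockRerankQuery,
    BedrockRerankSource,
    BedrockRerankTextDocument,
    BedrockRerankTextQuery,
)

# A text query and an inline text document source, as built in _transform_sources above.
query = BedrockRerankQuery(
    textQuery=BedrockRerankTextQuery(text="hello"),
    type="TEXT",
)
source = BedrockRerankSource(
    inlineDocumentSource=BedrockRerankInlineDocumentSource(
        textDocument=BedrockRerankTextDocument(text="hello world"),
        type="TEXT",
    ),
    type="INLINE",
)
print(query, source)
```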


@@ -36,9 +36,14 @@ class RerankResponseMeta(TypedDict, total=False):
tokens: RerankTokens
class RerankResponseResult(TypedDict):
index: int
relevance_score: float
class RerankResponse(BaseModel):
id: str
-results: List[dict]  # Contains index and relevance_score
+results: List[RerankResponseResult]  # Contains index and relevance_score
meta: Optional[RerankResponseMeta] = None  # Contains api_version and billed_units
# Define private attributes using PrivateAttr
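With the typed `results`, a `RerankResponse` now validates each entry as index + relevance_score (mirrors the updated logging test in this commit):

```python
from litellm.types.rerank import RerankResponse

response = RerankResponse(
    id="test",
    results=[{"index": 0, "relevance_score": 0.9}],
)
print(response.results[0]["relevance_score"])
```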


@@ -1874,22 +1874,11 @@ def supports_prompt_caching(
Raises:
Exception: If the given model is not found or there's an error in retrieval.
"""
-try:
-model, custom_llm_provider, _, _ = litellm.get_llm_provider(
-model=model, custom_llm_provider=custom_llm_provider
-)
-model_info = litellm.get_model_info(
-model=model, custom_llm_provider=custom_llm_provider
-)
-if model_info.get("supports_prompt_caching", False) is True:
-return True
-return False
-except Exception as e:
-raise Exception(
-f"Model not found or error in checking prompt caching support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
-)
+return _supports_factory(
+model=model,
+custom_llm_provider=custom_llm_provider,
+key="supports_prompt_caching",
+)
def supports_vision(model: str, custom_llm_provider: Optional[str] = None) -> bool:
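Usage is unchanged by the refactor; `supports_prompt_caching` still reads the flag out of the model cost map (the model name here is illustrative):

```python
import os

import litellm
from litellm.utils import supports_prompt_caching

os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

# True/False depending on the model map entry for this model.
print(
    supports_prompt_caching(
        model="anthropic.claude-3-5-sonnet-20240620-v1:0",
        custom_llm_provider="bedrock",
    )
)
```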


@@ -12,7 +12,8 @@
"supports_vision": true,
"supports_audio_input": true,
"supports_audio_output": true,
-"supports_prompt_caching": true
+"supports_prompt_caching": true,
"supports_response_schema": true
},
"gpt-4": {
"max_tokens": 4096,
@@ -4818,7 +4819,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_pdf_input": true
+"supports_pdf_input": true,
"supports_prompt_caching": true
},
"amazon.nova-lite-v1:0": {
"max_tokens": 4096,
@@ -4830,7 +4832,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_pdf_input": true
+"supports_pdf_input": true,
"supports_prompt_caching": true
},
"amazon.nova-pro-v1:0": {
"max_tokens": 4096,
@@ -4842,7 +4845,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_pdf_input": true
+"supports_pdf_input": true,
"supports_prompt_caching": true
},
"anthropic.claude-3-sonnet-20240229-v1:0": {
"max_tokens": 4096,
@@ -4876,7 +4880,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
-"supports_assistant_prefill": true
+"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
@@ -4898,7 +4903,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_assistant_prefill": true,
-"supports_function_calling": true
+"supports_function_calling": true,
"supports_prompt_caching": true
},
"anthropic.claude-3-opus-20240229-v1:0": {
"max_tokens": 4096,
@@ -4911,139 +4917,6 @@
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-sonnet-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"us.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.00000125,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-haiku-20241022-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000005,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_assistant_prefill": true,
"supports_function_calling": true
},
"us.anthropic.claude-3-opus-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-sonnet-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"eu.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.00000125,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-haiku-20241022-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000005,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true
},
"eu.anthropic.claude-3-opus-20240229-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-v1": { "anthropic.claude-v1": {
"max_tokens": 8191, "max_tokens": 8191,
"max_input_tokens": 100000, "max_input_tokens": 100000,
@ -6097,6 +5970,30 @@
"litellm_provider": "together_ai", "litellm_provider": "together_ai",
"mode": "embedding" "mode": "embedding"
}, },
"together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": {
"input_cost_per_token": 0.00000018,
"output_cost_per_token": 0.00000018,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": {
"input_cost_per_token": 0.00000088,
"output_cost_per_token": 0.00000088,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {
"input_cost_per_token": 0.0000035,
"output_cost_per_token": 0.0000035,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006, "input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006, "output_cost_per_token": 0.0000006,


@@ -23,6 +23,34 @@ from litellm.utils import (
from abc import ABC, abstractmethod
def _usage_format_tests(usage: litellm.Usage):
"""
OpenAI prompt caching
- prompt_tokens = sum of non-cache hit tokens + cache-hit tokens
- total_tokens = prompt_tokens + completion_tokens
Example
```
"usage": {
"prompt_tokens": 2006,
"completion_tokens": 300,
"total_tokens": 2306,
"prompt_tokens_details": {
"cached_tokens": 1920
},
"completion_tokens_details": {
"reasoning_tokens": 0
}
# ANTHROPIC_ONLY #
"cache_creation_input_tokens": 0
}
```
"""
assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
assert usage.prompt_tokens > usage.prompt_tokens_details.cached_tokens
class BaseLLMChatTest(ABC):
"""
Abstract base test class that enforces a common test across all test classes.
@@ -273,6 +301,78 @@ class BaseLLMChatTest(ABC):
response = litellm.completion(**base_completion_call_args, messages=messages)
assert response is not None
def test_prompt_caching(self):
litellm.set_verbose = True
from litellm.utils import supports_prompt_caching
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
base_completion_call_args = self.get_base_completion_call_args()
if not supports_prompt_caching(base_completion_call_args["model"], None):
print("Model does not support prompt caching")
pytest.skip("Model does not support prompt caching")
try:
for _ in range(2):
response = litellm.completion(
**base_completion_call_args,
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
)
_usage_format_tests(response.usage)
print("response=", response)
print("response.usage=", response.usage)
_usage_format_tests(response.usage)
assert "prompt_tokens_details" in response.usage
assert response.usage.prompt_tokens_details.cached_tokens > 0
except litellm.InternalServerError:
pass
@pytest.fixture
def pdf_messages(self):
import base64


@@ -79,6 +79,7 @@ class BaseLLMRerankTest(ABC):
@pytest.mark.asyncio()
@pytest.mark.parametrize("sync_mode", [True, False])
async def test_basic_rerank(self, sync_mode):
litellm.set_verbose = True
rerank_call_args = self.get_base_rerank_call_args()
custom_llm_provider = self.get_custom_llm_provider()
if sync_mode is True:
@@ -86,7 +87,7 @@ class BaseLLMRerankTest(ABC):
**rerank_call_args,
query="hello",
documents=["hello", "world"],
-top_n=3,
+top_n=2,
)
print("re rank response: ", response)
@@ -102,7 +103,7 @@ class BaseLLMRerankTest(ABC):
**rerank_call_args,
query="hello",
documents=["hello", "world"],
-top_n=3,
+top_n=2,
)
print("async re rank response: ", response)


@@ -666,7 +666,7 @@ from litellm import completion
class TestAnthropicCompletion(BaseLLMChatTest):
def get_base_completion_call_args(self) -> dict:
-return {"model": "claude-3-haiku-20240307"}
+return {"model": "anthropic/claude-3-5-sonnet-20240620"}
def test_tool_call_no_arguments(self, tool_call_no_arguments):
"""Test that tool calls with no arguments is translated correctly. Relevant issue: https://github.com/BerriAI/litellm/issues/6833"""


@@ -1,3 +1,7 @@
"""
Tests Bedrock Completion + Rerank endpoints
"""
# @pytest.mark.skip(reason="AWS Suspended Account")
import os
import sys
@@ -31,6 +35,7 @@ from litellm.llms.bedrock.chat import BedrockLLM
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import _bedrock_tools_pt
from base_llm_unit_tests import BaseLLMChatTest
from base_rerank_unit_tests import BaseLLMRerankTest
# litellm.num_retries = 3
litellm.cache = None
@@ -1971,13 +1976,67 @@ def test_bedrock_base_model_helper():
assert model == "us.amazon.nova-pro-v1:0"
@pytest.mark.parametrize(
"messages, expected_cache_control",
[
(
[ # test system prompt cache
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
True,
),
(
[ # test user prompt cache
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
],
True,
),
],
)
def test_bedrock_prompt_caching_message(messages, expected_cache_control):
import litellm
import json
transformed_messages = litellm.AmazonConverseConfig()._transform_request(
model="bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
messages=messages,
optional_params={},
litellm_params={},
)
if expected_cache_control:
assert "cachePoint" in json.dumps(transformed_messages)
else:
assert "cachePoint" not in json.dumps(transformed_messages)
class TestBedrockConverseChat(BaseLLMChatTest):
def get_base_completion_call_args(self) -> dict:
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
litellm.add_known_models()
return {
-"model": "bedrock/us.anthropic.claude-3-haiku-20240307-v1:0",
+"model": "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
}
def test_tool_call_no_arguments(self, tool_call_no_arguments):
@@ -1991,3 +2050,19 @@ class TestBedrockConverseChat(BaseLLMChatTest):
Todo: if litellm.modify_params is True ensure it's a valid utf-8 sequence
"""
pass
def test_prompt_caching(self):
"""
Remove override once we have access to Bedrock prompt caching
"""
pass
class TestBedrockRerank(BaseLLMRerankTest):
def get_custom_llm_provider(self) -> litellm.LlmProviders:
return litellm.LlmProviders.BEDROCK
def get_base_rerank_call_args(self) -> dict:
return {
"model": "bedrock/arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0",
}


@@ -0,0 +1,58 @@
"""
Test TogetherAI LLM
"""
from base_llm_unit_tests import BaseLLMChatTest
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
import pytest
class TestTogetherAI(BaseLLMChatTest):
def get_base_completion_call_args(self) -> dict:
litellm.set_verbose = True
return {"model": "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1"}
def test_tool_call_no_arguments(self, tool_call_no_arguments):
"""Test that tool calls with no arguments is translated correctly. Relevant issue: https://github.com/BerriAI/litellm/issues/6833"""
pass
def test_multilingual_requests(self):
"""
Mistral API raises a 400 BadRequest error when the request contains invalid utf-8 sequences.
"""
pass
@pytest.mark.parametrize(
"model, expected_bool",
[
("meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", True),
("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", False),
],
)
def test_get_supported_response_format_together_ai(
self, model: str, expected_bool: bool
) -> None:
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
optional_params = litellm.get_supported_openai_params(
model, custom_llm_provider="together_ai"
)
# Mapped provider
assert isinstance(optional_params, list)
if expected_bool:
assert "response_format" in optional_params
assert "tools" in optional_params
else:
assert "response_format" not in optional_params
assert "tools" not in optional_params


@@ -197,7 +197,7 @@ async def test_async_log_cache_hit_on_callbacks():
),
(
CallTypes.rerank.value,
-{"id": "test", "results": [{"index": 0, "score": 0.9}]},
+{"id": "test", "results": [{"index": 0, "relevance_score": 0.9}]},
RerankResponse,
),
(


@@ -38,76 +38,6 @@ def _usage_format_tests(usage: litellm.Usage):
assert usage.prompt_tokens > usage.prompt_tokens_details.cached_tokens
@pytest.mark.parametrize(
"model",
[
"anthropic/claude-3-5-sonnet-20240620",
# "openai/gpt-4o",
# "deepseek/deepseek-chat",
],
)
def test_prompt_caching_model(model):
try:
for _ in range(2):
response = litellm.completion(
model=model,
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
)
_usage_format_tests(response.usage)
print("response=", response)
print("response.usage=", response.usage)
_usage_format_tests(response.usage)
assert "prompt_tokens_details" in response.usage
assert response.usage.prompt_tokens_details.cached_tokens > 0
except litellm.InternalServerError:
pass
def test_supports_prompt_caching():
from litellm.utils import supports_prompt_caching


@@ -185,3 +185,22 @@ async def test_log_db_metrics_failure_error_types(exception, should_log):
else:
# Assert failure was NOT logged for non-DB errors
mock_proxy_logging.service_logging_obj.async_service_failure_hook.assert_not_called()
@pytest.mark.asyncio
async def test_dd_log_db_spend_failure_metrics():
from litellm._service_logger import ServiceLogging
from litellm.integrations.datadog.datadog import DataDogLogger
dd_logger = DataDogLogger()
with patch.object(dd_logger, "async_service_failure_hook", new_callable=AsyncMock):
service_logging_obj = ServiceLogging()
litellm.service_callback = [dd_logger]
await service_logging_obj.async_service_failure_hook(
service=ServiceTypes.DB,
call_type="test_call_type",
error="test_error",
duration=1.0,
)