Add cost tracking for rerank via bedrock (#8691)
* feat(bedrock/rerank): infer model region if model given as arn
* test: add unit testing to ensure bedrock region name inferred from arn on rerank
* feat(bedrock/rerank/transformation.py): include search units for bedrock rerank result (resolves https://github.com/BerriAI/litellm/issues/7258#issuecomment-2671557137)
* test(test_bedrock_completion.py): add testing for bedrock cohere rerank
* feat(cost_calculator.py): refactor rerank cost tracking to support bedrock cost tracking
* build(model_prices_and_context_window.json): add amazon.rerank model to model cost map
* fix(cost_calculator.py): bedrock/common_utils.py get base model from model w/ arn (handles rerank model)
* build(model_prices_and_context_window.json): add bedrock cohere rerank pricing
* feat(bedrock/rerank): migrate bedrock config to basererank config
* Revert "feat(bedrock/rerank): migrate bedrock config to basererank config" (reverts commit 84fae1f167)
* test: add testing to ensure large doc / queries are correctly counted
* Revert "test: add testing to ensure large doc / queries are correctly counted" (reverts commit 4337f1657e)
* fix(migrate-jina-ai-to-rerank-config): enables cost tracking
* refactor(jina_ai/): finish migrating jina ai to base rerank config (enables cost tracking)
* fix(jina_ai/rerank): e2e jina ai rerank cost tracking
* fix: cleanup dead code
* fix: fix python3.8 compatibility error
* test: fix test
* test: add e2e testing for azure ai rerank
* fix: fix linting error
* test: mark cohere as flaky
This commit is contained in:
parent 4c9517fd78
commit b682dc4ec8

26 changed files with 524 additions and 296 deletions
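A hedged sketch of what this commit enables end to end, mirroring the new unit test further down in this diff: rerank against a Bedrock model given as a full ARN, with the region inferred from the ARN and the computed cost attached to the response. The call assumes valid AWS credentials in the environment; it is illustrative, not part of the diff itself.

```python
# Illustrative usage (assumes AWS credentials are configured).
# Model ARN, query, and documents mirror the new unit test in this diff.
import litellm

response = litellm.rerank(
    model="bedrock/arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0",
    query="hello",
    documents=["hello", "world"],
)
# The request is routed to us-west-2 (parsed from the ARN) and billed per
# search unit; the tracked cost is surfaced via the response's hidden params.
print(response._hidden_params["response_cost"])
```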
@@ -10,6 +10,7 @@ ALL Bedrock models (Anthropic, Meta, Deepseek, Mistral, Amazon, etc.) are Supported
 | Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#deepseek-not-r1), [`bedrock/deepseek_r1/`](#deepseek-r1) |
 | Provider Doc | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) |
 | Supported OpenAI Endpoints | `/chat/completions`, `/completions`, `/embeddings`, `/images/generations` |
+| Rerank Endpoint | `/rerank` |
 | Pass-through Endpoint | [Supported](../pass_through/bedrock.md) |
@@ -820,6 +820,7 @@ from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig
 from .llms.cohere.rerank.transformation import CohereRerankConfig
 from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig
 from .llms.infinity.rerank.transformation import InfinityRerankConfig
+from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
 from .llms.clarifai.chat.transformation import ClarifaiConfig
 from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
 from .llms.together_ai.chat import TogetherAIConfig
@@ -16,15 +16,9 @@ from litellm.llms.anthropic.cost_calculation import (
 from litellm.llms.azure.cost_calculation import (
     cost_per_token as azure_openai_cost_per_token,
 )
-from litellm.llms.azure_ai.cost_calculator import (
-    cost_per_query as azure_ai_rerank_cost_per_query,
-)
 from litellm.llms.bedrock.image.cost_calculator import (
     cost_calculator as bedrock_image_cost_calculator,
 )
-from litellm.llms.cohere.cost_calculator import (
-    cost_per_query as cohere_rerank_cost_per_query,
-)
 from litellm.llms.databricks.cost_calculator import (
     cost_per_token as databricks_cost_per_token,
 )
@@ -51,10 +45,12 @@ from litellm.llms.vertex_ai.image_generation.cost_calculator import (
     cost_calculator as vertex_ai_image_cost_calculator,
 )
 from litellm.types.llms.openai import HttpxBinaryResponseContent
-from litellm.types.rerank import RerankResponse
+from litellm.types.rerank import RerankBilledUnits, RerankResponse
 from litellm.types.utils import (
     CallTypesLiteral,
+    LlmProviders,
     LlmProvidersSet,
+    ModelInfo,
     PassthroughCallTypes,
     Usage,
 )
@@ -64,6 +60,7 @@ from litellm.utils import (
     EmbeddingResponse,
     ImageResponse,
     ModelResponse,
+    ProviderConfigManager,
     TextCompletionResponse,
     TranscriptionResponse,
     _cached_get_model_info_helper,
@@ -114,6 +111,8 @@ def cost_per_token(  # noqa: PLR0915
     number_of_queries: Optional[int] = None,
     ### USAGE OBJECT ###
     usage_object: Optional[Usage] = None,  # just read the usage object if provided
+    ### BILLED UNITS ###
+    rerank_billed_units: Optional[RerankBilledUnits] = None,
     ### CALL TYPE ###
     call_type: CallTypesLiteral = "completion",
     audio_transcription_file_duration: float = 0.0,  # for audio transcription calls - the file time in seconds
@@ -238,6 +237,7 @@ def cost_per_token(  # noqa: PLR0915
         return rerank_cost(
             model=model,
             custom_llm_provider=custom_llm_provider,
+            billed_units=rerank_billed_units,
         )
     elif call_type == "atranscription" or call_type == "transcription":
         return openai_cost_per_second(
@@ -552,6 +552,7 @@ def completion_cost(  # noqa: PLR0915
     cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
         completion_response=completion_response
     )
+    rerank_billed_units: Optional[RerankBilledUnits] = None
     model = _select_model_name_for_cost_calc(
         model=model,
         completion_response=completion_response,
@@ -698,6 +699,11 @@ def completion_cost(  # noqa: PLR0915
             else:
                 billed_units = {}

+            rerank_billed_units = RerankBilledUnits(
+                search_units=billed_units.get("search_units"),
+                total_tokens=billed_units.get("total_tokens"),
+            )
+
             search_units = (
                 billed_units.get("search_units") or 1
             )  # cohere charges per request by default.
@@ -763,6 +769,7 @@ def completion_cost(  # noqa: PLR0915
             usage_object=cost_per_token_usage_object,
             call_type=call_type,
             audio_transcription_file_duration=audio_transcription_file_duration,
+            rerank_billed_units=rerank_billed_units,
         )
         _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
@@ -836,27 +843,33 @@ def response_cost_calculator(
 def rerank_cost(
     model: str,
     custom_llm_provider: Optional[str],
+    billed_units: Optional[RerankBilledUnits] = None,
 ) -> Tuple[float, float]:
     """
     Returns
     - float or None: cost of response OR none if error.
     """
-    default_num_queries = 1
     _, custom_llm_provider, _, _ = litellm.get_llm_provider(
         model=model, custom_llm_provider=custom_llm_provider
     )

     try:
-        if custom_llm_provider == "cohere":
-            return cohere_rerank_cost_per_query(
-                model=model, num_queries=default_num_queries
-            )
-        elif custom_llm_provider == "azure_ai":
-            return azure_ai_rerank_cost_per_query(
-                model=model, num_queries=default_num_queries
-            )
-        raise ValueError(
-            f"invalid custom_llm_provider for rerank model: {model}, custom_llm_provider: {custom_llm_provider}"
-        )
+        config = ProviderConfigManager.get_provider_rerank_config(
+            model=model, provider=LlmProviders(custom_llm_provider)
+        )
+
+        try:
+            model_info: Optional[ModelInfo] = litellm.get_model_info(
+                model=model, custom_llm_provider=custom_llm_provider
+            )
+        except Exception:
+            model_info = None
+
+        return config.calculate_rerank_cost(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            billed_units=billed_units,
+            model_info=model_info,
+        )
     except Exception as e:
         raise e
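The rewritten `rerank_cost` delegates to the provider's rerank config instead of hard-coded per-provider branches. For per-query providers, the rule implemented by `BaseRerankConfig.calculate_rerank_cost` (added below) is simply `input_cost_per_query * search_units`. A standalone sketch of that arithmetic, including the guard-clause fallbacks:

```python
# Standalone sketch of the per-search-unit rule; not the litellm code itself.
from typing import Optional, Tuple

def calc_rerank_cost(
    input_cost_per_query: Optional[float], search_units: Optional[int]
) -> Tuple[float, float]:
    # Missing pricing or billing info falls back to zero cost, matching
    # the guard clauses in BaseRerankConfig.calculate_rerank_cost.
    if input_cost_per_query is None or search_units is None:
        return 0.0, 0.0
    return input_cost_per_query * search_units, 0.0

# amazon.rerank-v1:0 is priced at $0.001 per query in the cost map below:
assert calc_rerank_cost(0.001, 1) == (0.001, 0.0)
assert calc_rerank_cost(None, 1) == (0.0, 0.0)
```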
@@ -1,32 +0,0 @@
-"""
-Handles custom cost calculation for Azure AI models.
-
-Custom cost calculation for Azure AI models only requied for rerank.
-"""
-
-from typing import Tuple
-
-from litellm.utils import get_model_info
-
-
-def cost_per_query(model: str, num_queries: int = 1) -> Tuple[float, float]:
-    """
-    Calculates the cost per query for a given rerank model.
-
-    Input:
-        - model: str, the model name without provider prefix
-
-    Returns:
-        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
-    """
-    model_info = get_model_info(model=model, custom_llm_provider="azure_ai")
-
-    if (
-        "input_cost_per_query" not in model_info
-        or model_info["input_cost_per_query"] is None
-    ):
-        return 0.0, 0.0
-
-    prompt_cost = model_info["input_cost_per_query"] * num_queries
-
-    return prompt_cost, 0.0
@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

 import httpx

-from litellm.types.rerank import OptionalRerankParams, RerankResponse
+from litellm.types.rerank import OptionalRerankParams, RerankBilledUnits, RerankResponse
+from litellm.types.utils import ModelInfo

 from ..chat.transformation import BaseLLMException
@@ -66,7 +67,7 @@ class BaseRerankConfig(ABC):
     @abstractmethod
     def map_cohere_rerank_params(
         self,
-        non_default_params: Optional[dict],
+        non_default_params: dict,
         model: str,
         drop_params: bool,
         query: str,
@@ -79,8 +80,48 @@ class BaseRerankConfig(ABC):
     ) -> OptionalRerankParams:
         pass

-    @abstractmethod
     def get_error_class(
         self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
     ) -> BaseLLMException:
-        pass
+        raise BaseLLMException(
+            status_code=status_code,
+            message=error_message,
+            headers=headers,
+        )
+
+    def calculate_rerank_cost(
+        self,
+        model: str,
+        custom_llm_provider: Optional[str] = None,
+        billed_units: Optional[RerankBilledUnits] = None,
+        model_info: Optional[ModelInfo] = None,
+    ) -> Tuple[float, float]:
+        """
+        Calculates the cost per query for a given rerank model.
+
+        Input:
+            - model: str, the model name without provider prefix
+            - custom_llm_provider: str, the provider used for the model. If provided, used to check if the litellm model info is for that provider.
+            - num_queries: int, the number of queries to calculate the cost for
+            - model_info: ModelInfo, the model info for the given model
+
+        Returns:
+            Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+        """
+
+        if (
+            model_info is None
+            or "input_cost_per_query" not in model_info
+            or model_info["input_cost_per_query"] is None
+            or billed_units is None
+        ):
+            return 0.0, 0.0
+
+        search_units = billed_units.get("search_units")
+
+        if search_units is None:
+            return 0.0, 0.0
+
+        prompt_cost = model_info["input_cost_per_query"] * search_units
+
+        return prompt_cost, 0.0
@@ -202,6 +202,61 @@ class BaseAWSLLM:
             self.iam_cache.set_cache(cache_key, credentials, ttl=_cache_ttl)
         return credentials

+    def _get_aws_region_from_model_arn(self, model: Optional[str]) -> Optional[str]:
+        try:
+            # First check if the string contains the expected prefix
+            if not isinstance(model, str) or "arn:aws:bedrock" not in model:
+                return None
+
+            # Split the ARN and check if we have enough parts
+            parts = model.split(":")
+            if len(parts) < 4:
+                return None
+
+            # Get the region from the correct position
+            region = parts[3]
+            if not region:  # Check if region is empty
+                return None
+
+            return region
+        except Exception:
+            # Catch any unexpected errors and return None
+            return None
+
+    def _get_aws_region_name(
+        self, optional_params: dict, model: Optional[str] = None
+    ) -> str:
+        """
+        Get the AWS region name from the environment variables
+        """
+        aws_region_name = optional_params.get("aws_region_name", None)
+        ### SET REGION NAME ###
+        if aws_region_name is None:
+            # check model arn #
+            aws_region_name = self._get_aws_region_from_model_arn(model)
+            # check env #
+            litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
+
+            if (
+                aws_region_name is None
+                and litellm_aws_region_name is not None
+                and isinstance(litellm_aws_region_name, str)
+            ):
+                aws_region_name = litellm_aws_region_name
+
+            standard_aws_region_name = get_secret("AWS_REGION", None)
+            if (
+                aws_region_name is None
+                and standard_aws_region_name is not None
+                and isinstance(standard_aws_region_name, str)
+            ):
+                aws_region_name = standard_aws_region_name
+
+            if aws_region_name is None:
+                aws_region_name = "us-west-2"
+
+        return aws_region_name
+
     @tracer.wrap()
     def _auth_with_web_identity_token(
         self,
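The region inference is a plain string split: a Bedrock model ARN has the shape `arn:aws:bedrock:<region>:<account>:<resource>`, so the region is the fourth colon-separated field. A standalone replica of the parsing rule, for illustration only:

```python
# Replica of _get_aws_region_from_model_arn's logic; not imported from litellm.
from typing import Optional

def region_from_arn(model: Optional[str]) -> Optional[str]:
    if not isinstance(model, str) or "arn:aws:bedrock" not in model:
        return None
    parts = model.split(":")  # arn:aws:bedrock:<region>:<account>:<resource>
    if len(parts) < 4 or not parts[3]:
        return None
    return parts[3]

assert region_from_arn(
    "arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0"
) == "us-west-2"
assert region_from_arn("amazon.rerank-v1:0") is None  # no ARN prefix
```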
@@ -423,7 +478,7 @@ class BaseAWSLLM:
         return endpoint_url, proxy_endpoint_url

     def _get_boto_credentials_from_optional_params(
-        self, optional_params: dict
+        self, optional_params: dict, model: Optional[str] = None
     ) -> Boto3CredentialsInfo:
         """
         Get boto3 credentials from optional params
@@ -443,7 +498,7 @@ class BaseAWSLLM:
         aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
         aws_access_key_id = optional_params.pop("aws_access_key_id", None)
         aws_session_token = optional_params.pop("aws_session_token", None)
-        aws_region_name = optional_params.pop("aws_region_name", None)
+        aws_region_name = self._get_aws_region_name(optional_params, model)
         aws_role_name = optional_params.pop("aws_role_name", None)
         aws_session_name = optional_params.pop("aws_session_name", None)
         aws_profile_name = optional_params.pop("aws_profile_name", None)
@@ -453,25 +508,6 @@ class BaseAWSLLM:
             "aws_bedrock_runtime_endpoint", None
         )  # https://bedrock-runtime.{region_name}.amazonaws.com

-        ### SET REGION NAME ###
-        if aws_region_name is None:
-            # check env #
-            litellm_aws_region_name = get_secret_str("AWS_REGION_NAME", None)
-
-            if litellm_aws_region_name is not None and isinstance(
-                litellm_aws_region_name, str
-            ):
-                aws_region_name = litellm_aws_region_name
-
-            standard_aws_region_name = get_secret_str("AWS_REGION", None)
-            if standard_aws_region_name is not None and isinstance(
-                standard_aws_region_name, str
-            ):
-                aws_region_name = standard_aws_region_name
-
-            if aws_region_name is None:
-                aws_region_name = "us-west-2"
-
         credentials: Credentials = self.get_credentials(
             aws_access_key_id=aws_access_key_id,
             aws_secret_access_key=aws_secret_access_key,
@@ -27,7 +27,7 @@ from litellm.llms.custom_httpx.http_handler import (
 )
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.utils import ModelResponse, Usage
-from litellm.utils import CustomStreamWrapper, get_secret
+from litellm.utils import CustomStreamWrapper

 if TYPE_CHECKING:
     from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
@@ -598,61 +598,6 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
         )
         return modelId

-    def get_aws_region_from_model_arn(self, model: Optional[str]) -> Optional[str]:
-        try:
-            # First check if the string contains the expected prefix
-            if not isinstance(model, str) or "arn:aws:bedrock" not in model:
-                return None
-
-            # Split the ARN and check if we have enough parts
-            parts = model.split(":")
-            if len(parts) < 4:
-                return None
-
-            # Get the region from the correct position
-            region = parts[3]
-            if not region:  # Check if region is empty
-                return None
-
-            return region
-        except Exception:
-            # Catch any unexpected errors and return None
-            return None
-
-    def _get_aws_region_name(
-        self, optional_params: dict, model: Optional[str] = None
-    ) -> str:
-        """
-        Get the AWS region name from the environment variables
-        """
-        aws_region_name = optional_params.get("aws_region_name", None)
-        ### SET REGION NAME ###
-        if aws_region_name is None:
-            # check model arn #
-            aws_region_name = self.get_aws_region_from_model_arn(model)
-            # check env #
-            litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
-
-            if (
-                aws_region_name is None
-                and litellm_aws_region_name is not None
-                and isinstance(litellm_aws_region_name, str)
-            ):
-                aws_region_name = litellm_aws_region_name
-
-            standard_aws_region_name = get_secret("AWS_REGION", None)
-            if (
-                aws_region_name is None
-                and standard_aws_region_name is not None
-                and isinstance(standard_aws_region_name, str)
-            ):
-                aws_region_name = standard_aws_region_name
-
-            if aws_region_name is None:
-                aws_region_name = "us-west-2"
-
-        return aws_region_name
-
     def _get_model_id_from_model_with_spec(
         self,
         model: str,
@@ -318,6 +318,23 @@ class BedrockModelInfo(BaseLLMModelInfo):
     global_config = AmazonBedrockGlobalConfig()
     all_global_regions = global_config.get_all_regions()

+    @staticmethod
+    def extract_model_name_from_arn(model: str) -> str:
+        """
+        Extract the model name from an AWS Bedrock ARN.
+        Returns the string after the last '/' if 'arn' is in the input string.
+
+        Args:
+            arn (str): The ARN string to parse
+
+        Returns:
+            str: The extracted model name if 'arn' is in the string,
+                 otherwise returns the original string
+        """
+        if "arn" in model.lower():
+            return model.split("/")[-1]
+        return model
+
     @staticmethod
     def get_base_model(model: str) -> str:
         """
@@ -335,6 +352,8 @@ class BedrockModelInfo(BaseLLMModelInfo):
         if model.startswith("invoke/"):
             model = model.split("/", 1)[1]

+        model = BedrockModelInfo.extract_model_name_from_arn(model)
+
         potential_region = model.split(".", 1)[0]

         alt_potential_region = model.split("/", 1)[
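`extract_model_name_from_arn` keeps everything after the last `/`, so a model addressed by full ARN still resolves to its base entry in the cost map. A quick standalone check of the rule:

```python
# Replica of BedrockModelInfo.extract_model_name_from_arn's rule.
def model_name_from_arn(model: str) -> str:
    if "arn" in model.lower():
        return model.split("/")[-1]
    return model

arn = "arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0"
assert model_name_from_arn(arn) == "amazon.rerank-v1:0"
assert model_name_from_arn("amazon.rerank-v1:0") == "amazon.rerank-v1:0"
```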
@@ -163,7 +163,7 @@ class BedrockImageGeneration(BaseAWSLLM):
         except ImportError:
             raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")
         boto3_credentials_info = self._get_boto_credentials_from_optional_params(
-            optional_params
+            optional_params, model
         )

         ### SET RUNTIME ENDPOINT ###
@@ -6,6 +6,8 @@ import httpx
 import litellm
 from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
 from litellm.llms.custom_httpx.http_handler import (
+    AsyncHTTPHandler,
+    HTTPHandler,
     _get_httpx_client,
     get_async_httpx_client,
 )
@@ -27,8 +29,10 @@ class BedrockRerankHandler(BaseAWSLLM):
     async def arerank(
         self,
         prepared_request: BedrockPreparedRequest,
+        client: Optional[AsyncHTTPHandler] = None,
     ):
-        client = get_async_httpx_client(llm_provider=litellm.LlmProviders.BEDROCK)
+        if client is None:
+            client = get_async_httpx_client(llm_provider=litellm.LlmProviders.BEDROCK)
         try:
             response = await client.post(url=prepared_request["endpoint_url"], headers=prepared_request["prepped"].headers, data=prepared_request["body"])  # type: ignore
             response.raise_for_status()
@@ -54,7 +58,9 @@ class BedrockRerankHandler(BaseAWSLLM):
         _is_async: Optional[bool] = False,
         api_base: Optional[str] = None,
         extra_headers: Optional[dict] = None,
+        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
     ) -> RerankResponse:

         request_data = RerankRequest(
             model=model,
             query=query,
@@ -66,6 +72,7 @@ class BedrockRerankHandler(BaseAWSLLM):
         data = BedrockRerankConfig()._transform_request(request_data)

         prepared_request = self._prepare_request(
+            model=model,
             optional_params=optional_params,
             api_base=api_base,
             extra_headers=extra_headers,
|
||||||
)
|
)
|
||||||
|
|
||||||
if _is_async:
|
if _is_async:
|
||||||
return self.arerank(prepared_request) # type: ignore
|
return self.arerank(prepared_request, client=client if client is not None and isinstance(client, AsyncHTTPHandler) else None) # type: ignore
|
||||||
|
|
||||||
client = _get_httpx_client()
|
if client is None or not isinstance(client, HTTPHandler):
|
||||||
|
client = _get_httpx_client()
|
||||||
try:
|
try:
|
||||||
response = client.post(url=prepared_request["endpoint_url"], headers=prepared_request["prepped"].headers, data=prepared_request["body"]) # type: ignore
|
response = client.post(url=prepared_request["endpoint_url"], headers=prepared_request["prepped"].headers, data=prepared_request["body"]) # type: ignore
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
@ -95,10 +103,18 @@ class BedrockRerankHandler(BaseAWSLLM):
|
||||||
except httpx.TimeoutException:
|
except httpx.TimeoutException:
|
||||||
raise BedrockError(status_code=408, message="Timeout error occurred.")
|
raise BedrockError(status_code=408, message="Timeout error occurred.")
|
||||||
|
|
||||||
return BedrockRerankConfig()._transform_response(response.json())
|
logging_obj.post_call(
|
||||||
|
original_response=response.text,
|
||||||
|
api_key="",
|
||||||
|
)
|
||||||
|
|
||||||
|
response_json = response.json()
|
||||||
|
|
||||||
|
return BedrockRerankConfig()._transform_response(response_json)
|
||||||
|
|
||||||
def _prepare_request(
|
def _prepare_request(
|
||||||
self,
|
self,
|
||||||
|
model: str,
|
||||||
api_base: Optional[str],
|
api_base: Optional[str],
|
||||||
extra_headers: Optional[dict],
|
extra_headers: Optional[dict],
|
||||||
data: dict,
|
data: dict,
|
||||||
|
@ -110,7 +126,7 @@ class BedrockRerankHandler(BaseAWSLLM):
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")
|
raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")
|
||||||
boto3_credentials_info = self._get_boto_credentials_from_optional_params(
|
boto3_credentials_info = self._get_boto_credentials_from_optional_params(
|
||||||
optional_params
|
optional_params, model
|
||||||
)
|
)
|
||||||
|
|
||||||
### SET RUNTIME ENDPOINT ###
|
### SET RUNTIME ENDPOINT ###
|
||||||
|
|
|
@@ -91,7 +91,9 @@ class BedrockRerankConfig:
        example input:
        {"results":[{"index":0,"relevanceScore":0.6847912669181824},{"index":1,"relevanceScore":0.5980774760246277}]}
        """
-        _billed_units = RerankBilledUnits(**response.get("usage", {}))
+        _billed_units = RerankBilledUnits(
+            **response.get("usage", {"search_units": 1})
+        )  # by default 1 search unit
         _tokens = RerankTokens(**response.get("usage", {}))
         rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens)
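Bedrock's rerank response carries no usage block, so the transformation now defaults to one search unit per request; combined with the `input_cost_per_query` entries added to the cost map below, a single Bedrock rerank call bills as one query. A sketch of the defaulting behavior, using a plain dict in place of `RerankBilledUnits`:

```python
# The example response shape comes from the docstring above; it carries
# no "usage" key, so billing falls back to one search unit.
response: dict = {"results": [{"index": 0, "relevanceScore": 0.68}]}
billed_units = response.get("usage", {"search_units": 1})
assert billed_units["search_units"] == 1
# cohere.rerank-v3-5:0 on Bedrock is $0.002 per query in the cost map below:
assert billed_units["search_units"] * 0.002 == 0.002
```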
@@ -1,31 +0,0 @@
-"""
-Custom cost calculator for Cohere rerank models
-"""
-
-from typing import Tuple
-
-from litellm.utils import get_model_info
-
-
-def cost_per_query(model: str, num_queries: int = 1) -> Tuple[float, float]:
-    """
-    Calculates the cost per query for a given rerank model.
-
-    Input:
-        - model: str, the model name without provider prefix
-
-    Returns:
-        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
-    """
-
-    model_info = get_model_info(model=model, custom_llm_provider="cohere")
-
-    if (
-        "input_cost_per_query" not in model_info
-        or model_info["input_cost_per_query"] is None
-    ):
-        return 0.0, 0.0
-
-    prompt_cost = model_info["input_cost_per_query"] * num_queries
-
-    return prompt_cost, 0.0
@@ -1,92 +1,3 @@
 """
-Re rank api
-
-LiteLLM supports the re rank API format, no paramter transformation occurs
+HTTP calling migrated to `llm_http_handler.py`
 """
-
-from typing import Any, Dict, List, Optional, Union
-
-import litellm
-from litellm.llms.base import BaseLLM
-from litellm.llms.custom_httpx.http_handler import (
-    _get_httpx_client,
-    get_async_httpx_client,
-)
-from litellm.llms.jina_ai.rerank.transformation import JinaAIRerankConfig
-from litellm.types.rerank import RerankRequest, RerankResponse
-
-
-class JinaAIRerank(BaseLLM):
-    def rerank(
-        self,
-        model: str,
-        api_key: str,
-        query: str,
-        documents: List[Union[str, Dict[str, Any]]],
-        top_n: Optional[int] = None,
-        rank_fields: Optional[List[str]] = None,
-        return_documents: Optional[bool] = True,
-        max_chunks_per_doc: Optional[int] = None,
-        _is_async: Optional[bool] = False,
-    ) -> RerankResponse:
-        client = _get_httpx_client()
-
-        request_data = RerankRequest(
-            model=model,
-            query=query,
-            top_n=top_n,
-            documents=documents,
-            rank_fields=rank_fields,
-            return_documents=return_documents,
-        )
-
-        # exclude None values from request_data
-        request_data_dict = request_data.dict(exclude_none=True)
-
-        if _is_async:
-            return self.async_rerank(request_data_dict, api_key)  # type: ignore # Call async method
-
-        response = client.post(
-            "https://api.jina.ai/v1/rerank",
-            headers={
-                "accept": "application/json",
-                "content-type": "application/json",
-                "authorization": f"Bearer {api_key}",
-            },
-            json=request_data_dict,
-        )
-
-        if response.status_code != 200:
-            raise Exception(response.text)
-
-        _json_response = response.json()
-
-        return JinaAIRerankConfig()._transform_response(_json_response)
-
-    async def async_rerank(  # New async method
-        self,
-        request_data_dict: Dict[str, Any],
-        api_key: str,
-    ) -> RerankResponse:
-        client = get_async_httpx_client(
-            llm_provider=litellm.LlmProviders.JINA_AI
-        )  # Use async client
-
-        response = await client.post(
-            "https://api.jina.ai/v1/rerank",
-            headers={
-                "accept": "application/json",
-                "content-type": "application/json",
-                "authorization": f"Bearer {api_key}",
-            },
-            json=request_data_dict,
-        )
-
-        if response.status_code != 200:
-            raise Exception(response.text)
-
-        _json_response = response.json()
-
-        return JinaAIRerankConfig()._transform_response(_json_response)
-
-    pass
@@ -7,30 +7,136 @@ Docs - https://jina.ai/reranker
 """

 import uuid
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from httpx import URL, Response
+
+from litellm.llms.base_llm.chat.transformation import LiteLLMLoggingObj
+from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
 from litellm.types.rerank import (
+    OptionalRerankParams,
     RerankBilledUnits,
     RerankResponse,
     RerankResponseMeta,
     RerankTokens,
 )
+from litellm.types.utils import ModelInfo


-class JinaAIRerankConfig:
-    def _transform_response(self, response: dict) -> RerankResponse:
-
-        _billed_units = RerankBilledUnits(**response.get("usage", {}))
-        _tokens = RerankTokens(**response.get("usage", {}))
+class JinaAIRerankConfig(BaseRerankConfig):
+    def get_supported_cohere_rerank_params(self, model: str) -> list:
+        return [
+            "query",
+            "top_n",
+            "documents",
+            "return_documents",
+        ]
+
+    def map_cohere_rerank_params(
+        self,
+        non_default_params: dict,
+        model: str,
+        drop_params: bool,
+        query: str,
+        documents: List[Union[str, Dict[str, Any]]],
+        custom_llm_provider: Optional[str] = None,
+        top_n: Optional[int] = None,
+        rank_fields: Optional[List[str]] = None,
+        return_documents: Optional[bool] = True,
+        max_chunks_per_doc: Optional[int] = None,
+    ) -> OptionalRerankParams:
+        optional_params = {}
+        supported_params = self.get_supported_cohere_rerank_params(model)
+        for k, v in non_default_params.items():
+            if k in supported_params:
+                optional_params[k] = v
+        return OptionalRerankParams(
+            **optional_params,
+        )
+
+    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
+        base_path = "/v1/rerank"
+
+        if api_base is None:
+            return "https://api.jina.ai/v1/rerank"
+        base = URL(api_base)
+        # Reconstruct URL with cleaned path
+        cleaned_base = str(base.copy_with(path=base_path))
+
+        return cleaned_base
+
+    def transform_rerank_request(
+        self, model: str, optional_rerank_params: OptionalRerankParams, headers: Dict
+    ) -> Dict:
+        return {"model": model, **optional_rerank_params}
+
+    def transform_rerank_response(
+        self,
+        model: str,
+        raw_response: Response,
+        model_response: RerankResponse,
+        logging_obj: LiteLLMLoggingObj,
+        api_key: Optional[str] = None,
+        request_data: Dict = {},
+        optional_params: Dict = {},
+        litellm_params: Dict = {},
+    ) -> RerankResponse:
+        if raw_response.status_code != 200:
+            raise Exception(raw_response.text)
+
+        logging_obj.post_call(original_response=raw_response.text)
+
+        _json_response = raw_response.json()
+
+        _billed_units = RerankBilledUnits(**_json_response.get("usage", {}))
+        _tokens = RerankTokens(**_json_response.get("usage", {}))
         rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens)

-        _results: Optional[List[dict]] = response.get("results")
+        _results: Optional[List[dict]] = _json_response.get("results")

         if _results is None:
-            raise ValueError(f"No results found in the response={response}")
+            raise ValueError(f"No results found in the response={_json_response}")

         return RerankResponse(
-            id=response.get("id") or str(uuid.uuid4()),
+            id=_json_response.get("id") or str(uuid.uuid4()),
             results=_results,  # type: ignore
             meta=rerank_meta,
         )  # Return response
+
+    def validate_environment(
+        self, headers: Dict, model: str, api_key: Optional[str] = None
+    ) -> Dict:
+        if api_key is None:
+            raise ValueError(
+                "api_key is required. Set via `api_key` parameter or `JINA_API_KEY` environment variable."
+            )
+        return {
+            "accept": "application/json",
+            "content-type": "application/json",
+            "authorization": f"Bearer {api_key}",
+        }
+
+    def calculate_rerank_cost(
+        self,
+        model: str,
+        custom_llm_provider: Optional[str] = None,
+        billed_units: Optional[RerankBilledUnits] = None,
+        model_info: Optional[ModelInfo] = None,
+    ) -> Tuple[float, float]:
+        """
+        Jina AI reranker is priced at $0.000000018 per token.
+        """
+        if (
+            model_info is None
+            or "input_cost_per_token" not in model_info
+            or model_info["input_cost_per_token"] is None
+            or billed_units is None
+        ):
+            return 0.0, 0.0
+
+        total_tokens = billed_units.get("total_tokens")
+        if total_tokens is None:
+            return 0.0, 0.0
+
+        input_cost = model_info["input_cost_per_token"] * total_tokens
+        return input_cost, 0.0
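Unlike the per-query providers, Jina AI bills per token, so `calculate_rerank_cost` above multiplies `input_cost_per_token` by the `total_tokens` the API reports. With the cost-map entry added below ($0.000000018 per token), a rerank that bills 1,000 tokens costs $0.000018:

```python
# Token-based rerank pricing as implemented by JinaAIRerankConfig above;
# the per-token price mirrors the jina-reranker-v2-base-multilingual entry.
input_cost_per_token = 0.000000018
total_tokens = 1_000  # hypothetical billed_units["total_tokens"]
input_cost = input_cost_per_token * total_tokens
assert abs(input_cost - 0.000018) < 1e-12
```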
@@ -5982,6 +5982,19 @@
     "litellm_provider": "bedrock",
     "mode": "chat"
   },
+  "amazon.rerank-v1:0": {
+    "max_tokens": 32000,
+    "max_input_tokens": 32000,
+    "max_output_tokens": 32000,
+    "max_query_tokens": 32000,
+    "max_document_chunks_per_query": 100,
+    "max_tokens_per_document_chunk": 512,
+    "input_cost_per_token": 0.0,
+    "input_cost_per_query": 0.001,
+    "output_cost_per_token": 0.0,
+    "litellm_provider": "bedrock",
+    "mode": "rerank"
+  },
   "amazon.titan-text-lite-v1": {
     "max_tokens": 4000,
     "max_input_tokens": 42000,
@@ -7022,6 +7035,19 @@
     "mode": "chat",
     "supports_tool_choice": true
   },
+  "cohere.rerank-v3-5:0": {
+    "max_tokens": 32000,
+    "max_input_tokens": 32000,
+    "max_output_tokens": 32000,
+    "max_query_tokens": 32000,
+    "max_document_chunks_per_query": 100,
+    "max_tokens_per_document_chunk": 512,
+    "input_cost_per_token": 0.0,
+    "input_cost_per_query": 0.002,
+    "output_cost_per_token": 0.0,
+    "litellm_provider": "bedrock",
+    "mode": "rerank"
+  },
   "cohere.command-text-v14": {
     "max_tokens": 4096,
     "max_input_tokens": 4096,
@@ -9154,5 +9180,15 @@
     "input_cost_per_second": 0.00003333,
     "output_cost_per_second": 0.00,
     "litellm_provider": "assemblyai"
+  },
+  "jina-reranker-v2-base-multilingual": {
+    "max_tokens": 1024,
+    "max_input_tokens": 1024,
+    "max_output_tokens": 1024,
+    "max_document_chunks_per_query": 2048,
+    "input_cost_per_token": 0.000000018,
+    "output_cost_per_token": 0.000000018,
+    "litellm_provider": "jina_ai",
+    "mode": "rerank"
   }
 }
@@ -9,7 +9,6 @@ from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
 from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
 from litellm.llms.bedrock.rerank.handler import BedrockRerankHandler
 from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
-from litellm.llms.jina_ai.rerank.handler import JinaAIRerank
 from litellm.llms.together_ai.rerank.handler import TogetherAIRerank
 from litellm.rerank_api.rerank_utils import get_optional_rerank_params
 from litellm.secret_managers.main import get_secret, get_secret_str
@@ -20,7 +19,6 @@ from litellm.utils import ProviderConfigManager, client, exception_type
 ####### ENVIRONMENT VARIABLES ###################
 # Initialize any necessary instances or variables here
 together_rerank = TogetherAIRerank()
-jina_ai_rerank = JinaAIRerank()
 bedrock_rerank = BedrockRerankHandler()
 base_llm_http_handler = BaseLLMHTTPHandler()
 #################################################
@@ -264,16 +262,26 @@ def rerank(  # noqa: PLR0915
             raise ValueError(
                 "Jina AI API key is required, please set 'JINA_AI_API_KEY' in your environment"
             )
-        response = jina_ai_rerank.rerank(
+
+        api_base = (
+            dynamic_api_base
+            or optional_params.api_base
+            or litellm.api_base
+            or get_secret("BEDROCK_API_BASE")  # type: ignore
+        )
+
+        response = base_llm_http_handler.rerank(
             model=model,
-            api_key=dynamic_api_key,
-            query=query,
-            documents=documents,
-            top_n=top_n,
-            rank_fields=rank_fields,
-            return_documents=return_documents,
-            max_chunks_per_doc=max_chunks_per_doc,
+            custom_llm_provider=_custom_llm_provider,
+            optional_rerank_params=optional_rerank_params,
+            logging_obj=litellm_logging_obj,
+            timeout=optional_params.timeout,
+            api_key=dynamic_api_key or optional_params.api_key,
+            api_base=api_base,
             _is_async=_is_async,
+            headers=headers or litellm.headers or {},
+            client=client,
+            model_response=model_response,
         )
     elif _custom_llm_provider == "bedrock":
         api_base = (
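With the dedicated handler deleted, Jina AI rerank requests flow through the shared `base_llm_http_handler`, which is what wires up logging and, via the config above, cost tracking. A hedged usage sketch (assumes `JINA_AI_API_KEY` is set; the model name is the cost-map entry added below):

```python
# Illustrative call; requires JINA_AI_API_KEY in the environment.
import litellm

response = litellm.rerank(
    model="jina_ai/jina-reranker-v2-base-multilingual",
    query="What is the capital of the United States?",
    documents=["Washington, D.C. is the capital.", "Paris is in France."],
)
# Cost is now computed from the billed total_tokens in the response meta.
print(response.results)
print(response._hidden_params.get("response_cost"))
```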
@@ -295,6 +303,7 @@ def rerank(  # noqa: PLR0915
             optional_params=optional_params.model_dump(exclude_unset=True),
             api_base=api_base,
             logging_obj=litellm_logging_obj,
+            client=client,
         )
     else:
         raise ValueError(f"Unsupported provider: {_custom_llm_provider}")
@@ -17,6 +17,17 @@ def get_optional_rerank_params(
     max_chunks_per_doc: Optional[int] = None,
     non_default_params: Optional[dict] = None,
 ) -> OptionalRerankParams:
+    all_non_default_params = non_default_params or {}
+    if query is not None:
+        all_non_default_params["query"] = query
+    if top_n is not None:
+        all_non_default_params["top_n"] = top_n
+    if documents is not None:
+        all_non_default_params["documents"] = documents
+    if return_documents is not None:
+        all_non_default_params["return_documents"] = return_documents
+    if max_chunks_per_doc is not None:
+        all_non_default_params["max_chunks_per_doc"] = max_chunks_per_doc
     return rerank_provider_config.map_cohere_rerank_params(
         model=model,
         drop_params=drop_params,
@@ -27,5 +38,5 @@ def get_optional_rerank_params(
         rank_fields=rank_fields,
         return_documents=return_documents,
         max_chunks_per_doc=max_chunks_per_doc,
-        non_default_params=non_default_params,
+        non_default_params=all_non_default_params,
     )
@@ -6198,6 +6198,8 @@ class ProviderConfigManager:
             return litellm.AzureAIRerankConfig()
         elif litellm.LlmProviders.INFINITY == provider:
             return litellm.InfinityRerankConfig()
+        elif litellm.LlmProviders.JINA_AI == provider:
+            return litellm.JinaAIRerankConfig()
         return litellm.CohereRerankConfig()

     @staticmethod
The same three hunks are applied to a second copy of the model cost map:

@@ -5982,6 +5982,19 @@
     "litellm_provider": "bedrock",
     "mode": "chat"
   },
+  "amazon.rerank-v1:0": {
+    "max_tokens": 32000,
+    "max_input_tokens": 32000,
+    "max_output_tokens": 32000,
+    "max_query_tokens": 32000,
+    "max_document_chunks_per_query": 100,
+    "max_tokens_per_document_chunk": 512,
+    "input_cost_per_token": 0.0,
+    "input_cost_per_query": 0.001,
+    "output_cost_per_token": 0.0,
+    "litellm_provider": "bedrock",
+    "mode": "rerank"
+  },
   "amazon.titan-text-lite-v1": {
     "max_tokens": 4000,
     "max_input_tokens": 42000,
@@ -7022,6 +7035,19 @@
     "mode": "chat",
     "supports_tool_choice": true
   },
+  "cohere.rerank-v3-5:0": {
+    "max_tokens": 32000,
+    "max_input_tokens": 32000,
+    "max_output_tokens": 32000,
+    "max_query_tokens": 32000,
+    "max_document_chunks_per_query": 100,
+    "max_tokens_per_document_chunk": 512,
+    "input_cost_per_token": 0.0,
+    "input_cost_per_query": 0.002,
+    "output_cost_per_token": 0.0,
+    "litellm_provider": "bedrock",
+    "mode": "rerank"
+  },
   "cohere.command-text-v14": {
     "max_tokens": 4096,
     "max_input_tokens": 4096,
@@ -9154,5 +9180,15 @@
     "input_cost_per_second": 0.00003333,
     "output_cost_per_second": 0.00,
     "litellm_provider": "assemblyai"
+  },
+  "jina-reranker-v2-base-multilingual": {
+    "max_tokens": 1024,
+    "max_input_tokens": 1024,
+    "max_output_tokens": 1024,
+    "max_document_chunks_per_query": 2048,
+    "input_cost_per_token": 0.000000018,
+    "output_cost_per_token": 0.000000018,
+    "litellm_provider": "jina_ai",
+    "mode": "rerank"
   }
 }
tests/litellm/llms/bedrock/rerank/transformation.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+import json
+import os
+import sys
+
+import pytest
+from fastapi.testclient import TestClient
+
+sys.path.insert(
+    0, os.path.abspath("../../..")
+)  # Adds the parent directory to the system path
+from unittest.mock import MagicMock, patch
+
+from litellm import rerank
+from litellm.llms.custom_httpx.http_handler import HTTPHandler
tests/litellm/rerank_api/test_main.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+import json
+import os
+import sys
+
+import pytest
+from fastapi.testclient import TestClient
+
+sys.path.insert(
+    0, os.path.abspath("../../..")
+)  # Adds the parent directory to the system path
+from unittest.mock import MagicMock, patch
+
+from litellm import rerank
+from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+
+def test_rerank_infer_region_from_model_arn(monkeypatch):
+    mock_response = MagicMock()
+
+    monkeypatch.setenv("AWS_REGION_NAME", "us-east-1")
+    args = {
+        "model": "bedrock/arn:aws:bedrock:us-west-2::foundation-model/amazon.rerank-v1:0",
+        "query": "hello",
+        "documents": ["hello", "world"],
+    }
+
+    def return_val():
+        return {
+            "results": [
+                {"index": 0, "relevanceScore": 0.6716859340667725},
+                {"index": 1, "relevanceScore": 0.0004994205664843321},
+            ]
+        }
+
+    mock_response.json = return_val
+    mock_response.headers = {"key": "value"}
+    mock_response.status_code = 200
+
+    client = HTTPHandler()
+
+    with patch.object(client, "post", return_value=mock_response) as mock_post:
+        rerank(
+            model=args["model"],
+            query=args["query"],
+            documents=args["documents"],
+            client=client,
+        )
+        mock_post.assert_called_once()
+        print(f"mock_post.call_args: {mock_post.call_args.kwargs}")
+        assert "us-west-2" in mock_post.call_args.kwargs["url"]
+        assert "us-east-1" not in mock_post.call_args.kwargs["url"]
@@ -33,6 +33,7 @@ def assert_response_shape(response, custom_llm_provider):
     expected_api_version_shape = {"version": str}

     expected_billed_units_shape = {"search_units": int}
+    expected_billed_units_total_tokens_shape = {"total_tokens": int}

     assert isinstance(response.id, expected_response_shape["id"])
     assert isinstance(response.results, expected_response_shape["results"])
@ -52,9 +53,15 @@ def assert_response_shape(response, custom_llm_provider):
|
||||||
response.meta["api_version"]["version"],
|
response.meta["api_version"]["version"],
|
||||||
expected_api_version_shape["version"],
|
expected_api_version_shape["version"],
|
||||||
)
|
)
|
||||||
|
assert isinstance(
|
||||||
|
response.meta["billed_units"], expected_meta_shape["billed_units"]
|
||||||
|
)
|
||||||
|
if "total_tokens" in response.meta["billed_units"]:
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
response.meta["billed_units"], expected_meta_shape["billed_units"]
|
response.meta["billed_units"]["total_tokens"],
|
||||||
|
expected_billed_units_total_tokens_shape["total_tokens"],
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
response.meta["billed_units"]["search_units"],
|
response.meta["billed_units"]["search_units"],
|
||||||
expected_billed_units_shape["search_units"],
|
expected_billed_units_shape["search_units"],
|
||||||
|
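The new branch reflects the two billing shapes rerank providers return: token-billed APIs report `total_tokens`, while search-billed APIs (Cohere-style, including Bedrock's rerank models) report `search_units`. A standalone sketch of consuming either shape; `meta` stands in for `response.meta` on a litellm RerankResponse, and which key appears is provider-dependent (an assumption based on the branch in this test helper).

# `meta` mimics response.meta as asserted above.
meta = {"billed_units": {"search_units": 1}}  # Cohere/Bedrock-style billing

billed = meta["billed_units"]
if "total_tokens" in billed:      # token-billed providers
    print("billed by tokens:", billed["total_tokens"])
else:                             # search-unit-billed providers
    print("billed by search units:", billed["search_units"])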
@@ -79,7 +86,9 @@ class BaseLLMRerankTest(ABC):
     @pytest.mark.asyncio()
     @pytest.mark.parametrize("sync_mode", [True, False])
     async def test_basic_rerank(self, sync_mode):
-        litellm.set_verbose = True
+        litellm._turn_on_debug()
+        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+        litellm.model_cost = litellm.get_model_cost_map(url="")
         rerank_call_args = self.get_base_rerank_call_args()
         custom_llm_provider = self.get_custom_llm_provider()
         if sync_mode is True:
@@ -95,6 +104,9 @@ class BaseLLMRerankTest(ABC):
             assert response.id is not None
             assert response.results is not None
 
+            assert response._hidden_params["response_cost"] is not None
+            assert response._hidden_params["response_cost"] > 0
+
             assert_response_shape(
                 response=response, custom_llm_provider=custom_llm_provider.value
             )
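Setting LITELLM_LOCAL_MODEL_COST_MAP=True and loading get_model_cost_map(url="") makes pricing come from the cost map bundled with the package, so the response_cost assertions do not depend on the hosted map. A hedged sketch of the same check outside the test harness; the Cohere model name is illustrative and a COHERE_API_KEY is assumed to be set.

import os

import litellm

# Price from the packaged cost map; url="" is the same offline trick the
# base test above uses.
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

response = litellm.rerank(  # assumes COHERE_API_KEY is set in the environment
    model="cohere/rerank-english-v3.0",
    query="hello",
    documents=["hello", "world"],
)
cost = response._hidden_params["response_cost"]
assert cost is not None and cost > 0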
@@ -14,6 +14,7 @@ from litellm.llms.anthropic.chat import ModelResponseIterator
 import httpx
 import json
 from litellm.llms.custom_httpx.http_handler import HTTPHandler
+from base_rerank_unit_tests import BaseLLMRerankTest
 
 load_dotenv()
 import io
@@ -185,6 +186,7 @@ def test_completion_azure_ai_command_r():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
+
 def test_azure_deepseek_reasoning_content():
     import json
@@ -192,37 +194,48 @@ def test_azure_deepseek_reasoning_content():
 
     with patch.object(client, "post") as mock_post:
         mock_response = MagicMock()
 
         mock_response.text = json.dumps(
             {
                 "choices": [
                     {
                         "finish_reason": "stop",
                         "index": 0,
                         "message": {
                             "content": "<think>I am thinking here</think>\n\nThe sky is a canvas of blue",
                             "role": "assistant",
-                        }
+                        },
                     }
                 ],
             }
         )
 
         mock_response.status_code = 200
         # Add required response attributes
         mock_response.headers = {"Content-Type": "application/json"}
         mock_response.json = lambda: json.loads(mock_response.text)
         mock_post.return_value = mock_response
 
 
         response = litellm.completion(
-            model='azure_ai/deepseek-r1',
+            model="azure_ai/deepseek-r1",
            messages=[{"role": "user", "content": "Hello, world!"}],
             api_base="https://litellm8397336933.services.ai.azure.com/models/chat/completions",
             api_key="my-fake-api-key",
-            client=client
+            client=client,
         )
 
         print(response)
-        assert(response.choices[0].message.reasoning_content == "I am thinking here")
-        assert(response.choices[0].message.content == "\n\nThe sky is a canvas of blue")
+        assert response.choices[0].message.reasoning_content == "I am thinking here"
+        assert response.choices[0].message.content == "\n\nThe sky is a canvas of blue"
+
+
+class TestAzureAIRerank(BaseLLMRerankTest):
+    def get_custom_llm_provider(self) -> litellm.LlmProviders:
+        return litellm.LlmProviders.AZURE_AI
+
+    def get_base_rerank_call_args(self) -> dict:
+        return {
+            "model": "azure_ai/cohere-rerank-v3-english",
+            "api_base": os.getenv("AZURE_AI_COHERE_API_BASE"),
+            "api_key": os.getenv("AZURE_AI_COHERE_API_KEY"),
+        }
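TestAzureAIRerank plugs Azure AI into the shared BaseLLMRerankTest, so the Cohere rerank deployment gets the same shape and cost-tracking assertions as every other provider. A sketch of the equivalent direct call, assuming the same env vars point at your own deployment.

import os

import litellm

# Endpoint and key come from the same env vars the test args use; both are
# deployment-specific assumptions.
response = litellm.rerank(
    model="azure_ai/cohere-rerank-v3-english",
    query="What is the capital of France?",
    documents=["Paris is the capital of France.", "Berlin is in Germany."],
    api_base=os.getenv("AZURE_AI_COHERE_API_BASE"),
    api_key=os.getenv("AZURE_AI_COHERE_API_KEY"),
)
print(response.results)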
tests/local_testing/test_bedrock_completion.py
@@ -2186,6 +2186,16 @@ class TestBedrockRerank(BaseLLMRerankTest):
         }
 
 
+class TestBedrockCohereRerank(BaseLLMRerankTest):
+    def get_custom_llm_provider(self) -> litellm.LlmProviders:
+        return litellm.LlmProviders.BEDROCK
+
+    def get_base_rerank_call_args(self) -> dict:
+        return {
+            "model": "bedrock/arn:aws:bedrock:us-west-2::foundation-model/cohere.rerank-v3-5:0",
+        }
+
+
 @pytest.mark.parametrize(
     "messages, continue_message_index",
     [
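Note the test args pass only the ARN: the region is inferred from it (us-west-2), and cost tracking resolves pricing from the cohere.rerank-v3-5 entry in the model cost map. A hedged usage sketch, assuming standard AWS credentials in the environment.

import litellm

# Region comes from the ARN, not AWS_REGION_NAME; AWS credentials are
# assumed to be configured (env vars, profile, or instance role).
response = litellm.rerank(
    model="bedrock/arn:aws:bedrock:us-west-2::foundation-model/cohere.rerank-v3-5:0",
    query="hello",
    documents=["hello", "world"],
)
print(response.results)
print(response._hidden_params["response_cost"])  # populated by rerank cost tracking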
@@ -66,6 +66,7 @@ def assert_response_shape(response, custom_llm_provider):
 
 @pytest.mark.asyncio()
 @pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_basic_rerank(sync_mode):
     litellm.set_verbose = True
     if sync_mode is True:
@@ -311,6 +312,7 @@ def test_complete_base_url_cohere():
         (3, None, False),
     ],
 )
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_basic_rerank_caching(sync_mode, top_n_1, top_n_2, expect_cache_hit):
     from litellm.caching.caching import Cache
 
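The retries/delay kwargs suggest the pytest-retry plugin (an inference; the diff does not name it): a failed run is retried up to three times with a one-second pause, which absorbs transient rate limits on live Cohere calls. Minimal sketch:

import random

import pytest


# Assumes the pytest-retry plugin provides this marker: the test is re-run
# up to `retries` times, sleeping `delay` seconds between attempts.
@pytest.mark.flaky(retries=3, delay=1)
def test_occasionally_flaky_network_call():
    assert random.random() < 0.9  # stand-in for a transient API failure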
@@ -1574,7 +1574,11 @@ def test_completion_cost_azure_ai_rerank(model):
                 "relevance_score": 0.990732,
             },
         ],
-        meta={},
+        meta={
+            "billed_units": {
+                "search_units": 1,
+            }
+        },
     )
     print("response", response)
     model = model
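With billed_units.search_units populated, the expected cost is simply units times the per-search-unit price from the cost map. A back-of-the-envelope sketch; the 0.002 price and the idea of a per-query entry in the map are illustrative assumptions, not quoted pricing.

# Cohere-style rerank billing: one search unit per query (up to 100 docs).
price_per_search_unit = 0.002  # illustrative, e.g. $2 per 1,000 searches
search_units = 1               # response.meta["billed_units"]["search_units"]

expected_cost = search_units * price_per_search_unit
assert abs(expected_cost - 0.002) < 1e-12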