Merge pull request #5573 from BerriAI/litellm_add_rerank_spend_tracking

[Feat] Add cost tracking for cohere rerank
Ishaan Jaff 2024-09-06 18:11:30 -07:00 committed by GitHub
commit 64a828b455
6 changed files with 98 additions and 1 deletion

View file

@@ -22,6 +22,7 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha
 from litellm.llms.anthropic.cost_calculation import (
     cost_per_token as anthropic_cost_per_token,
 )
+from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
 from litellm.types.utils import PassthroughCallTypes, Usage
@@ -93,6 +94,8 @@ def cost_per_token(
         "transcription",
         "aspeech",
         "speech",
+        "rerank",
+        "arerank",
     ] = "completion",
 ) -> Tuple[float, float]:
     """
@@ -487,6 +490,8 @@ def completion_cost(
         "transcription",
         "aspeech",
         "speech",
+        "rerank",
+        "arerank",
     ] = "completion",
     ### REGION ###
     custom_llm_provider=None,
@@ -747,6 +752,7 @@ def response_cost_calculator(
         TranscriptionResponse,
         TextCompletionResponse,
         HttpxBinaryResponseContent,
+        RerankResponse,
     ],
     model: str,
     custom_llm_provider: Optional[str],
@@ -765,6 +771,8 @@ def response_cost_calculator(
         "transcription",
         "aspeech",
         "speech",
+        "rerank",
+        "arerank",
     ],
     optional_params: dict,
     cache_hit: Optional[bool] = None,
@@ -789,6 +797,15 @@ def response_cost_calculator(
                 call_type=call_type,
                 custom_llm_provider=custom_llm_provider,
             )
+        elif isinstance(response_object, RerankResponse) and (
+            call_type == "arerank" or call_type == "rerank"
+        ):
+            response_cost = rerank_cost(
+                rerank_response=response_object,
+                model=model,
+                call_type=call_type,
+                custom_llm_provider=custom_llm_provider,
+            )
         else:
             if custom_pricing is True:  # override defaults if custom pricing is set
                 base_model = model
@@ -820,3 +837,28 @@
             )
         )
         return None
+
+
+def rerank_cost(
+    rerank_response: RerankResponse,
+    model: str,
+    call_type: Literal["rerank", "arerank"],
+    custom_llm_provider: Optional[str],
+) -> float:
+    """
+    Returns
+    - float: cost of the rerank response. Raises if the provider has no rerank pricing.
+    """
+    _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
+    try:
+        if custom_llm_provider == "cohere":
+            return 0.002  # flat per-request price for cohere rerank
+        raise ValueError(
+            f"invalid custom_llm_provider for rerank model: {model}, custom_llm_provider: {custom_llm_provider}"
+        )
+    except Exception as e:
+        verbose_logger.exception(
+            f"litellm.cost_calculator.py::rerank_cost - Exception occurred - {str(e)}"
+        )
+        raise e
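
The new rerank_cost helper prices a Cohere rerank call as a flat 0.002 per request rather than deriving cost from token counts or billed_units. A minimal self-contained sketch of that pricing rule (price_rerank_request is a hypothetical name, not litellm's API):

def price_rerank_request(custom_llm_provider: str) -> float:
    # Flat per-request pricing, mirroring the branch added above.
    if custom_llm_provider == "cohere":
        return 0.002  # USD per rerank request, per this commit
    raise ValueError(f"no rerank pricing for provider: {custom_llm_provider}")

assert price_rerank_request("cohere") == 0.002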

View file

@@ -30,6 +30,7 @@ from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.redact_messages import (
     redact_message_input_output_from_logging,
 )
+from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
 from litellm.types.utils import (
@@ -525,6 +526,7 @@ class Logging:
             TranscriptionResponse,
             TextCompletionResponse,
             HttpxBinaryResponseContent,
+            RerankResponse,
         ],
         cache_hit: Optional[bool] = None,
     ):
@@ -588,6 +590,7 @@ class Logging:
             or isinstance(result, TranscriptionResponse)
             or isinstance(result, TextCompletionResponse)
             or isinstance(result, HttpxBinaryResponseContent)  # tts
+            or isinstance(result, RerankResponse)
         ):
             ## RESPONSE COST ##
             self.model_call_details["response_cost"] = (
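
With RerankResponse added to the isinstance chain, the success handler now computes response_cost for rerank results and stores it in model_call_details, where callbacks can read it (the test added below asserts exactly this). A self-contained sketch of the type-gate pattern this extends, with an illustrative stand-in class rather than litellm's types:

from typing import Optional

class FakeRerankResponse:
    # Stand-in for litellm's RerankResponse, for illustration only.
    pass

def attach_cost(result: object) -> Optional[float]:
    # Cost is computed only for response types the calculator can price.
    if isinstance(result, FakeRerankResponse):
        return 0.002  # rerank results are now priceable
    return None  # unrecognized types still get no cost

assert attach_cost(FakeRerankResponse()) == 0.002
assert attach_cost("plain string") is None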

View file

@@ -9,7 +9,7 @@ from litellm.llms.cohere.rerank import CohereRerank
 from litellm.llms.togetherai.rerank import TogetherAIRerank
 from litellm.secret_managers.main import get_secret
 from litellm.types.router import *
-from litellm.utils import supports_httpx_timeout
+from litellm.utils import client, supports_httpx_timeout
 
 from .types import RerankRequest, RerankResponse
@@ -20,6 +20,7 @@ together_rerank = TogetherAIRerank()
 #################################################
 
+@client
 async def arerank(
     model: str,
     query: str,
@@ -64,6 +65,7 @@ async def arerank(
         raise e
 
 
+@client
 def rerank(
     model: str,
     query: str,
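
Decorating rerank and arerank with @client is what routes rerank calls through litellm's logging wrapper, so they pick up cost tracking without further changes. A hedged usage sketch, assuming a valid COHERE_API_KEY is set in the environment (model and parameters mirror the test added below):

import litellm

response = litellm.rerank(
    model="cohere/rerank-english-v3.0",
    query="hello",
    documents=["hello", "world"],
    top_n=3,
)
print(response.results)  # list of {index, relevance_score} dicts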

View file

@@ -23,3 +23,16 @@ class RerankResponse(BaseModel):
     id: str
     results: List[dict]  # Contains index and relevance_score
     meta: dict  # Contains api_version and billed_units
+
+    _hidden_params: dict = {}
+
+    class Config:
+        underscore_attrs_are_private = True
+
+    def __getitem__(self, key):
+        return self.__dict__[key]
+
+    def get(self, key, default=None):
+        return self.__dict__.get(key, default)
+
+    def __contains__(self, key):
+        return key in self.__dict__
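
The new dunder methods let callers treat a RerankResponse like a dict, and _hidden_params (kept private by the Pydantic v1 underscore_attrs_are_private config) gives the logging pipeline a place to stash metadata. An illustrative sketch with made-up values:

from litellm.rerank_api.types import RerankResponse

resp = RerankResponse(
    id="rerank-123",
    results=[{"index": 0, "relevance_score": 0.98}],
    meta={"api_version": {"version": "1"}, "billed_units": {"search_units": 1}},
)
print(resp["id"])              # __getitem__ -> "rerank-123"
print(resp.get("missing", 0))  # get() with a default -> 0
print("results" in resp)       # __contains__ -> True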

View file

@@ -1,3 +1,4 @@
+import asyncio
 import json
 import os
 import sys
@@ -20,6 +21,7 @@ import pytest
 
 import litellm
 from litellm import RateLimitError, Timeout, completion, completion_cost, embedding
+from litellm.integrations.custom_logger import CustomLogger
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
@@ -177,3 +179,37 @@ async def test_rerank_custom_api_base():
     assert response.results is not None
 
     assert_response_shape(response, custom_llm_provider="cohere")
+
+
+class TestLogger(CustomLogger):
+    def __init__(self):
+        self.kwargs = None
+        self.response_obj = None
+        super().__init__()
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print("in success event for rerank, kwargs = ", kwargs)
+        print("in success event for rerank, response_obj = ", response_obj)
+        self.kwargs = kwargs
+        self.response_obj = response_obj
+
+
+@pytest.mark.asyncio()
+async def test_rerank_custom_callbacks():
+    custom_logger = TestLogger()
+    litellm.callbacks = [custom_logger]
+    response = await litellm.arerank(
+        model="cohere/rerank-english-v3.0",
+        query="hello",
+        documents=["hello", "world"],
+        top_n=3,
+    )
+
+    await asyncio.sleep(5)  # give the async success callback time to fire
+    print("async re rank response: ", response)
+
+    assert custom_logger.kwargs is not None
+    assert custom_logger.kwargs.get("response_cost") > 0.0
+    assert custom_logger.response_obj is not None
+    assert custom_logger.response_obj.results is not None

View file

@@ -745,6 +745,7 @@ def client(original_function):
             or kwargs.get("amoderation", False) == True
             or kwargs.get("atext_completion", False) == True
             or kwargs.get("atranscription", False) == True
+            or kwargs.get("arerank", False) == True
         ):
             # [OPTIONAL] CHECK MAX RETRIES / REQUEST
             if litellm.num_retries_per_request is not None:
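
The @client wrapper decides between its sync and async code paths by checking which entry-point flag is set in kwargs; arerank now joins that list, which is how the async rerank call reaches the async logging handlers. A minimal sketch of the flag check (is_async_call is a hypothetical helper, not litellm's):

def is_async_call(kwargs: dict) -> bool:
    # Async entry points mark themselves with a boolean kwarg.
    async_flags = ("acompletion", "amoderation", "atext_completion",
                   "atranscription", "arerank")
    return any(kwargs.get(flag, False) is True for flag in async_flags)

assert is_async_call({"arerank": True}) is True
assert is_async_call({}) is False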