From 9b53ea4b0fdcc0446171ee3b2a54ef5ee57fa199 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Thu, 23 Nov 2023 16:08:44 -0800
Subject: [PATCH] (feat) proxy: cost tracking per completion request

---
 litellm/proxy/utils.py | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 348dd19f02..d1d9554996 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -1,7 +1,40 @@
-# import threading, time, litellm
-# import concurrent.futures
-# """
-# v1:
+import litellm
+from litellm import ModelResponse
+from proxy_server import llm_model_list
+
+def track_cost_callback(
+    kwargs,                                      # kwargs to completion
+    completion_response: ModelResponse = None,   # response from completion
+    start_time = None,
+    end_time = None,                             # start/end time for completion
+):
+    try:
+        # init logging config
+        print("in custom callback tracking cost", llm_model_list)
+        if "azure" in kwargs["model"]:
+            # for azure cost tracking, we check the provided model list in the config.yaml
+            # we need to map azure/chatgpt-deployment to -> azure/gpt-3.5-turbo
+            pass
+        # check if it has collected an entire stream response
+        if "complete_streaming_response" in kwargs:
+            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
+            completion_response = kwargs["complete_streaming_response"]
+            input_text = kwargs["messages"]
+            output_text = completion_response["choices"][0]["message"]["content"]
+            response_cost = litellm.completion_cost(
+                model = kwargs["model"],
+                messages = input_text,
+                completion = output_text
+            )
+            print("streaming response_cost", response_cost)
+        # for non streaming responses
+        else:
+            # we pass the completion_response obj
+            if kwargs["stream"] != True:
+                response_cost = litellm.completion_cost(completion_response=completion_response)
+                print("regular response_cost", response_cost)
+    except:
+        pass
 # 1. `--experimental_async` starts 2 background threads:
 #     - 1. to check the redis queue:
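
A minimal usage sketch of how this callback could be wired up, assuming litellm's custom success-callback hook (litellm.success_callback invokes registered functions with the same (kwargs, completion_response, start_time, end_time) signature used above); the import path and model name below are illustrative and not taken from this patch:

    # hypothetical registration of the cost-tracking callback; litellm calls
    # every function in success_callback after each successful completion
    import litellm
    from litellm.proxy.utils import track_cost_callback  # illustrative path; assumes the module is importable outside the proxy

    litellm.success_callback = [track_cost_callback]

    # any completion routed through litellm now triggers the cost print above
    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
    )

The callback covers two paths: for streamed requests the assembled output is only available after the stream finishes (kwargs["complete_streaming_response"]), so cost is computed from the input messages plus the concatenated output text; for non-streaming requests the full ModelResponse object is passed directly to litellm.completion_cost.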