From 9b53ea4b0fdcc0446171ee3b2a54ef5ee57fa199 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Thu, 23 Nov 2023 16:08:44 -0800
Subject: [PATCH] (feat) proxy: cost tracking per completion request

---
 litellm/proxy/utils.py | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 348dd19f02..d1d9554996 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -1,7 +1,40 @@
-# import threading, time, litellm
-# import concurrent.futures
-# """
-# v1:
+import litellm
+from litellm import ModelResponse
+from proxy_server import llm_model_list
+
+def track_cost_callback(
+    kwargs,                                      # kwargs to completion
+    completion_response: ModelResponse = None,   # response from completion
+    start_time = None,
+    end_time = None,                             # start/end time for completion
+):
+    try:
+        # init logging config
+        print("in custom callback tracking cost", llm_model_list)
+        if "azure" in kwargs["model"]:
+            # for azure cost tracking, we check the provided model list in the config.yaml
+            # we need to map azure/chatgpt-deployment to -> azure/gpt-3.5-turbo
+            pass
+        # check if it has collected an entire stream response
+        if "complete_streaming_response" in kwargs:
+            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
+            completion_response = kwargs["complete_streaming_response"]
+            input_text = kwargs["messages"]
+            output_text = completion_response["choices"][0]["message"]["content"]
+            response_cost = litellm.completion_cost(
+                model = kwargs["model"],
+                messages = input_text,
+                completion = output_text
+            )
+            print("streaming response_cost", response_cost)
+        # for non streaming responses
+        else:
+            # we pass the completion_response obj
+            if kwargs["stream"] != True:
+                response_cost = litellm.completion_cost(completion_response=completion_response)
+                print("regular response_cost", response_cost)
+    except:
+        pass
 # 1. `--experimental_async` starts 2 background threads:
 #     - 1. to check the redis queue:
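
A minimal usage sketch of how this callback could be wired up, assuming litellm's custom success-callback hook (litellm.success_callback invokes registered functions with the same (kwargs, completion_response, start_time, end_time) signature used above); the import path and model name below are illustrative and not taken from this patch:

    # hypothetical registration of the cost-tracking callback; litellm calls
    # every function in success_callback after each successful completion
    import litellm
    from litellm.proxy.utils import track_cost_callback  # illustrative path; assumes the module is importable outside the proxy

    litellm.success_callback = [track_cost_callback]

    # any completion routed through litellm now triggers the cost print above
    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
    )

The callback covers two paths: for streamed requests the assembled output is only available after the stream finishes (kwargs["complete_streaming_response"]), so cost is computed from the input messages plus the concatenated output text; for non-streaming requests the full ModelResponse object is passed directly to litellm.completion_cost.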