(feat) proxy_server add a track_cost_callback for streaming

2023-10-10 11:33:08 -07:00 · 2023-10-10 11:33:08 -07:00 · 7496afdf64
commit 7496afdf64
parent 68b655df51
2 changed files with 37 additions and 26 deletions
--- a/litellm/proxy/cost.log
+++ b/litellm/proxy/cost.log
@@ -1,4 +1,6 @@
-2023-10-09 14:46:28 - Model gpt-3.5-turbo-0613 Cost: 6.1e-05
-2023-10-09 14:46:29 - Model gpt-3.5-turbo Cost: 0.0
-2023-10-09 14:48:18 - Model gpt-3.5-turbo-0613 Cost: 0.00004700
-2023-10-09 14:48:18 - Model gpt-3.5-turbo Cost: 0.00000000
+2023-10-10 11:30:40 - Model gpt-3.5-turbo Cost: $0.00126900
+2023-10-10 11:31:02 - Model gpt-3.5-turbo Cost: $0.00131700
+2023-10-10 11:31:57 - Model gpt-3.5-turbo Cost: $0.00132450
+2023-10-10 11:32:04 - Model gpt-3.5-turbo Cost: $0.00148000
+2023-10-10 11:32:05 - Model gpt-3.5-turbo Cost: $0.00138800
+2023-10-10 11:32:14 - Model gpt-3.5-turbo Cost: $0.00160450
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -118,36 +118,45 @@ def data_generator(response):
        print_verbose(f"returned chunk: {chunk}")
        yield f"data: {json.dumps(chunk)}\n\n"

-def custom_callback(
+def track_cost_callback(
    kwargs,                 # kwargs to completion
    completion_response,    # response from completion
    start_time, end_time    # start/end time
 ):
-    # Your custom code here
-    print("LITELLM: in custom callback function")
-    # print("kwargs", kwargs)
-    # print("start_time", start_time)
-    # print("end_time", end_time)
-    if "complete_streaming_response" in kwargs:
-        print("GOT COMPLETE STREAMING RESPINSE", kwargs["complete_streaming_response"])
-        response_cost = litellm.completion_cost(
-            completion_response=kwargs["complete_streaming_response"]
+    try:
+        # init logging config
+        logging.basicConfig(
+                filename='cost.log',
+                level=logging.INFO,
+                format='%(asctime)s - %(message)s',
+                datefmt='%Y-%m-%d %H:%M:%S'
        )
-        print("response_cost", response_cost)
-    else:
-        print("completion_response", completion_response)
-        response_cost = litellm.completion_cost(completion_response=completion_response)

-    logging.basicConfig(
-            filename='cost.log',
-            level=logging.INFO,
-            format='%(asctime)s - %(message)s',
-            datefmt='%Y-%m-%d %H:%M:%S'
-    )
-    logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
+        # check if it has collected an entire stream response
+        if "complete_streaming_response" in kwargs:
+            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost 
+            completion_response=kwargs["complete_streaming_response"]
+            input_text = kwargs["messages"]
+            output_text = completion_response["choices"][0]["message"]["content"]
+            response_cost = litellm.completion_cost(
+                model = kwargs["model"],
+                messages = input_text,
+                completion=output_text
+            )
+            print("streaming response_cost", response_cost)
+            logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")

+        # for non streaming responses
+        else:
+            # we pass the completion_response obj
+            if kwargs["stream"] != True:
+                response_cost = litellm.completion_cost(completion_response=completion_response)
+                print("regular response_cost", response_cost)
+                logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
+    except:
+        pass

-litellm.success_callback = [custom_callback]
+litellm.success_callback = [track_cost_callback]

 def litellm_completion(data, type): 
    try: