diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 78e756a2a..af6d3fd3a 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -570,7 +570,7 @@ async def track_cost_callback(
         litellm_params = kwargs.get("litellm_params", {}) or {}
         proxy_server_request = litellm_params.get("proxy_server_request") or {}
         user_id = proxy_server_request.get("body", {}).get("user", None)
-        if "response_cost" in kwargs:
+        if kwargs.get("response_cost", None) is not None:
             response_cost = kwargs["response_cost"]
             user_api_key = kwargs["litellm_params"]["metadata"].get(
                 "user_api_key", None
@@ -596,9 +596,13 @@ async def track_cost_callback(
                 end_time=end_time,
             )
         else:
-            raise Exception(
-                f"Model not in litellm model cost map. Add custom pricing - https://docs.litellm.ai/docs/proxy/custom_pricing"
-            )
+            if kwargs["stream"] != True or (
+                kwargs["stream"] == True
+                and "complete_streaming_response" in kwargs
+            ):
+                raise Exception(
+                    f"Model not in litellm model cost map. Add custom pricing - https://docs.litellm.ai/docs/proxy/custom_pricing"
+                )
     except Exception as e:
         verbose_proxy_logger.debug(f"error in tracking cost callback - {str(e)}")
 
diff --git a/litellm/utils.py b/litellm/utils.py
index 00b76bfb5..762f94af4 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1067,9 +1067,13 @@ class Logging:
         ## if model in model cost map - log the response cost
         ## else set cost to None
         verbose_logger.debug(f"Model={self.model}; result={result}")
-        if result is not None and (
-            isinstance(result, ModelResponse)
-            or isinstance(result, EmbeddingResponse)
+        if (
+            result is not None
+            and (
+                isinstance(result, ModelResponse)
+                or isinstance(result, EmbeddingResponse)
+            )
+            and self.stream != True
         ):
             try:
                 self.model_call_details["response_cost"] = litellm.completion_cost(
@@ -1104,6 +1108,12 @@ class Logging:
         self, result=None, start_time=None, end_time=None, cache_hit=None, **kwargs
     ):
         verbose_logger.debug(f"Logging Details LiteLLM-Success Call")
+        start_time, end_time, result = self._success_handler_helper_fn(
+            start_time=start_time,
+            end_time=end_time,
+            result=result,
+            cache_hit=cache_hit,
+        )
         # print(f"original response in success handler: {self.model_call_details['original_response']}")
         try:
             verbose_logger.debug(f"success callbacks: {litellm.success_callback}")
@@ -1119,6 +1129,8 @@ class Logging:
                     complete_streaming_response = litellm.stream_chunk_builder(
                         self.sync_streaming_chunks,
                         messages=self.model_call_details.get("messages", None),
+                        start_time=start_time,
+                        end_time=end_time,
                     )
                 except:
                     complete_streaming_response = None
@@ -1132,13 +1144,19 @@ class Logging:
                 self.model_call_details[
                     "complete_streaming_response"
                 ] = complete_streaming_response
+                try:
+                    self.model_call_details["response_cost"] = litellm.completion_cost(
+                        completion_response=complete_streaming_response,
+                    )
+                    verbose_logger.debug(
+                        f"Model={self.model}; cost={self.model_call_details['response_cost']}"
+                    )
+                except litellm.NotFoundError as e:
+                    verbose_logger.debug(
+                        f"Model={self.model} not found in completion cost map."
+                    )
+                    self.model_call_details["response_cost"] = None
-            start_time, end_time, result = self._success_handler_helper_fn(
-                start_time=start_time,
-                end_time=end_time,
-                result=result,
-                cache_hit=cache_hit,
-            )
 
             for callback in litellm.success_callback:
                 try:
                     if callback == "lite_debugger":
@@ -1423,6 +1441,18 @@ class Logging:
                 self.model_call_details[
                     "complete_streaming_response"
                 ] = complete_streaming_response
+                try:
+                    self.model_call_details["response_cost"] = litellm.completion_cost(
+                        completion_response=complete_streaming_response,
+                    )
+                    verbose_logger.debug(
+                        f"Model={self.model}; cost={self.model_call_details['response_cost']}"
+                    )
+                except litellm.NotFoundError as e:
+                    verbose_logger.debug(
+                        f"Model={self.model} not found in completion cost map."
+                    )
+                    self.model_call_details["response_cost"] = None
 
             for callback in litellm._async_success_callback:
                 try:
diff --git a/tests/test_keys.py b/tests/test_keys.py
index f209f4c5a..f06b6721e 100644
--- a/tests/test_keys.py
+++ b/tests/test_keys.py
@@ -4,13 +4,20 @@
 import pytest
 import asyncio
 import aiohttp
+from openai import AsyncOpenAI
+import sys, os
+
+sys.path.insert(
+    0, os.path.abspath("../")
+)  # Adds the parent directory to the system path
+import litellm
 
 
 async def generate_key(session, i):
     url = "http://0.0.0.0:4000/key/generate"
     headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}
     data = {
-        "models": ["azure-models"],
+        "models": ["azure-models", "gpt-4"],
         "aliases": {"mistral-7b": "gpt-3.5-turbo"},
         "duration": None,
     }
@@ -82,6 +89,34 @@ async def chat_completion(session, key, model="gpt-4"):
         if status != 200:
             raise Exception(f"Request did not return a 200 status code: {status}")
 
+        return await response.json()
+
+
+async def chat_completion_streaming(session, key, model="gpt-4"):
+    client = AsyncOpenAI(api_key=key, base_url="http://0.0.0.0:4000")
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello!"},
+    ]
+    data = {
+        "model": model,
+        "messages": messages,
+        "stream": True,
+    }
+    response = await client.chat.completions.create(**data)
+
+    content = ""
+    async for chunk in response:
+        content += chunk.choices[0].delta.content or ""
+
+    print(f"content: {content}")
+    prompt_tokens = litellm.token_counter(model="azure/gpt-35-turbo", messages=messages)
+    completion_tokens = litellm.token_counter(
+        model="azure/gpt-35-turbo", text=content, count_response_tokens=True
+    )
+
+    return prompt_tokens, completion_tokens
+
 
 
 @pytest.mark.asyncio
 async def test_key_update():
@@ -181,3 +216,49 @@ async def test_key_info():
         random_key = key_gen["key"]
         status = await get_key_info(session=session, get_key=key, call_key=random_key)
         assert status == 403
+
+
+@pytest.mark.asyncio
+async def test_key_info_spend_values():
+    """
+    - create key
+    - make completion call
+    - assert cost is expected value
+    """
+    async with aiohttp.ClientSession() as session:
+        ## Test Spend Update ##
+        # completion
+        # response = await chat_completion(session=session, key=key)
+        # prompt_cost, completion_cost = litellm.cost_per_token(
+        #     model="azure/gpt-35-turbo",
+        #     prompt_tokens=response["usage"]["prompt_tokens"],
+        #     completion_tokens=response["usage"]["completion_tokens"],
+        # )
+        # response_cost = prompt_cost + completion_cost
+        # await asyncio.sleep(5)  # allow db log to be updated
+        # key_info = await get_key_info(session=session, get_key=key, call_key=key)
+        # print(
+        #     f"response_cost: {response_cost}; key_info spend: {key_info['info']['spend']}"
+        # )
+        # assert response_cost == key_info["info"]["spend"]
+        ## streaming
+        key_gen = await generate_key(session=session, i=0)
+        new_key = key_gen["key"]
+        prompt_tokens, completion_tokens = await chat_completion_streaming(
+            session=session, key=new_key
+        )
+        print(f"prompt_tokens: {prompt_tokens}, completion_tokens: {completion_tokens}")
+        prompt_cost, completion_cost = litellm.cost_per_token(
+            model="azure/gpt-35-turbo",
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+        response_cost = prompt_cost + completion_cost
+        await asyncio.sleep(5)  # allow db log to be updated
+        key_info = await get_key_info(
+            session=session, get_key=new_key, call_key=new_key
+        )
+        print(
+            f"response_cost: {response_cost}; key_info spend: {key_info['info']['spend']}"
+        )
+        assert response_cost == key_info["info"]["spend"]
diff --git a/tests/test_openai_endpoints.py b/tests/test_openai_endpoints.py
index 5a91bffa7..67d7c4db9 100644
--- a/tests/test_openai_endpoints.py
+++ b/tests/test_openai_endpoints.py
@@ -68,6 +68,7 @@ async def chat_completion(session, key):
         if status != 200:
             raise Exception(f"Request did not return a 200 status code: {status}")
 
+        return await response.json()
 
 
 @pytest.mark.asyncio