diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 2b34acceb..9aef0304c 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -556,7 +556,7 @@ class PrismaClient:
                 where={"token": token},  # type: ignore
                 data={**db_data},  # type: ignore
             )
-            print_verbose(
+            verbose_proxy_logger.debug(
                 "\033[91m"
                 + f"DB Token Table update succeeded {response}"
                 + "\033[0m"
diff --git a/litellm/utils.py b/litellm/utils.py
index cca8bc85e..7a6b12a82 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2938,17 +2938,25 @@ def cost_per_token(
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model_with_provider in model_cost_ref:
-        print_verbose(f"Looking up model={model_with_provider} in model_cost_map")
+        verbose_logger.debug(
+            f"Looking up model={model_with_provider} in model_cost_map"
+        )
+        verbose_logger.debug(
+            f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
+        )
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
         )
+        verbose_logger.debug(
+            f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}"
+        )
         completion_tokens_cost_usd_dollar = (
             model_cost_ref[model_with_provider]["output_cost_per_token"]
             * completion_tokens
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif "ft:gpt-3.5-turbo" in model:
-        print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
+        verbose_logger.debug(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
         # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
@@ -2959,17 +2967,23 @@
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model in litellm.azure_llms:
-        print_verbose(f"Cost Tracking: {model} is an Azure LLM")
+        verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
         model = litellm.azure_llms[model]
+        verbose_logger.debug(
+            f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
+        )
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
         )
+        verbose_logger.debug(
+            f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
+        )
         completion_tokens_cost_usd_dollar = (
             model_cost_ref[model]["output_cost_per_token"] * completion_tokens
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model in litellm.azure_embedding_models:
-        print_verbose(f"Cost Tracking: {model} is an Azure Embedding Model")
+        verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
         model = litellm.azure_embedding_models[model]
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
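Reviewer note: the new cost-lookup debug lines only appear when the logger is at DEBUG level. A minimal sketch of exercising the patched branches, assuming litellm's verbose_logger is the standard logging.Logger defined in litellm._logging (the handler setup below is illustrative, not litellm's canonical configuration):

    import logging

    import litellm
    from litellm._logging import verbose_logger

    # Route debug records somewhere visible; litellm's own handler wiring may differ.
    logging.basicConfig(level=logging.DEBUG)
    verbose_logger.setLevel(logging.DEBUG)

    # Exercise the Azure pricing branch patched above; token counts are illustrative.
    prompt_cost, completion_cost = litellm.cost_per_token(
        model="azure/gpt-35-turbo",
        prompt_tokens=19,
        completion_tokens=25,
    )
    print(f"prompt_cost={prompt_cost}, completion_cost={completion_cost}")
    # Expected debug output (values come from litellm's bundled model cost map):
    #   applying cost=<input_cost_per_token> for prompt_tokens=19
    #   applying cost=<output_cost_per_token> for completion_tokens=25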
diff --git a/tests/test_keys.py b/tests/test_keys.py
index a0bf7387d..917c50823 100644
--- a/tests/test_keys.py
+++ b/tests/test_keys.py
@@ -2,7 +2,7 @@
 ## Tests /key endpoints.
 
 import pytest
-import asyncio
+import asyncio, time
 import aiohttp
 from openai import AsyncOpenAI
 import sys, os
@@ -95,11 +95,10 @@ async def chat_completion(session, key, model="gpt-4"):
 async def chat_completion_streaming(session, key, model="gpt-4"):
     client = AsyncOpenAI(api_key=key, base_url="http://0.0.0.0:4000")
     messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Hello!"},
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": f"Hello! {time.time()}"},
     ]
     prompt_tokens = litellm.token_counter(model="gpt-35-turbo", messages=messages)
-    assert prompt_tokens == 19
     data = {
         "model": model,
         "messages": messages,
@@ -114,7 +113,7 @@ async def chat_completion_streaming(session, key, model="gpt-4"):
     print(f"content: {content}")
 
     completion_tokens = litellm.token_counter(
-        model="azure/gpt-35-turbo", text=content, count_response_tokens=True
+        model="gpt-35-turbo", text=content, count_response_tokens=True
     )
 
     return prompt_tokens, completion_tokens
@@ -251,7 +250,7 @@ async def test_key_info_spend_values():
     )
     print(f"prompt_tokens: {prompt_tokens}, completion_tokens: {completion_tokens}")
     prompt_cost, completion_cost = litellm.cost_per_token(
-        model="gpt-35-turbo",
+        model="azure/gpt-35-turbo",
         prompt_tokens=prompt_tokens,
         completion_tokens=completion_tokens,
     )
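Reviewer note: the test now counts tokens with the bare gpt-35-turbo alias on both the prompt and response side, routes the cost lookup through the azure/ prefix, and salts the prompt with time.time(), presumably so repeated runs cannot hit a cached response; that salt also varies the prompt length, which is why the hard-coded assert prompt_tokens == 19 had to go. A minimal sketch of the accounting flow the updated test relies on (message contents mirror tests/test_keys.py; printed costs depend on the current model cost map, so treat the numbers as illustrative):

    import time

    import litellm

    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        # time.time() salts the prompt so each run sends a unique request
        {"role": "user", "content": f"Hello! {time.time()}"},
    ]

    # Token counting uses the bare model alias for both prompt and response.
    prompt_tokens = litellm.token_counter(model="gpt-35-turbo", messages=messages)
    completion_tokens = litellm.token_counter(
        model="gpt-35-turbo", text="Hi there!", count_response_tokens=True
    )

    # The cost lookup goes through the azure/ prefix, matching the Azure
    # pricing branches patched in litellm/utils.py above.
    prompt_cost, completion_cost = litellm.cost_per_token(
        model="azure/gpt-35-turbo",
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
    )
    print(f"expected spend: {prompt_cost + completion_cost}")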