diff --git a/litellm/_logging.py b/litellm/_logging.py
index e9a4a99cd..438fa9743 100644
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@@ -7,8 +7,11 @@
 handler = logging.StreamHandler()
 handler.setLevel(logging.DEBUG)
 
 # Create a formatter and set it for the handler
+formatter = logging.Formatter(
+    "\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
+    datefmt="%H:%M:%S",
+)
 
-formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
 
 handler.setFormatter(formatter)
diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py
index 31e3f0d16..e381c93f7 100644
--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@@ -220,8 +220,10 @@ def get_ollama_response(
         model_response["choices"][0]["message"] = response_json["message"]
         model_response["created"] = int(time.time())
         model_response["model"] = "ollama/" + model
-        prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
-        completion_tokens = response_json["eval_count"]
+        prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages))  # type: ignore
+        completion_tokens = response_json.get(
+            "eval_count", litellm.token_counter(text=response_json["message"])
+        )
         model_response["usage"] = litellm.Usage(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
@@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
             model_response["choices"][0]["message"] = response_json["message"]
             model_response["created"] = int(time.time())
             model_response["model"] = "ollama/" + data["model"]
-            prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
-            completion_tokens = response_json["eval_count"]
+            prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"]))  # type: ignore
+            completion_tokens = response_json.get(
+                "eval_count", litellm.token_counter(text=response_json["message"])
+            )
             model_response["usage"] = litellm.Usage(
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py
index 8a059c507..670cefcf2 100644
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@@ -343,8 +343,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
     endTime: Union[str, datetime, None]
     user: Optional[str] = ""
     modelParameters: Optional[Json] = {}
-    messages: Optional[Json] = []
-    response: Optional[Json] = {}
     usage: Optional[Json] = {}
     metadata: Optional[Json] = {}
     cache_hit: Optional[str] = "False"
+    cache_key: Optional[str] = None
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 97168b19f..b06faac32 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -67,6 +67,8 @@ litellm_settings:
 
 general_settings:
   master_key: sk-1234
+  alerting: ["slack"]
+  alerting_threshold: 10 # sends alerts if requests hang for 10 seconds
   # database_type: "dynamo_db"
   # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
   #   "billing_mode": "PAY_PER_REQUEST",
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 4a9be4632..e638d5d45 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -626,6 +626,12 @@ async def track_cost_callback(
                 "user_api_key_user_id", None
             )
 
+            if kwargs.get("cache_hit", False) == True:
+                response_cost = 0.0
+                verbose_proxy_logger.info(
+                    f"Cache Hit: response_cost {response_cost}, for user_id {user_id}"
+                )
+
             verbose_proxy_logger.info(
                 f"response_cost {response_cost}, for user_id {user_id}"
             )
@@ -1429,8 +1435,6 @@ async def initialize(
         verbose_proxy_logger.setLevel(
             level=logging.DEBUG
         )  # set proxy logs to debug
-        litellm.set_verbose = True
-
     dynamic_config = {"general": {}, user_model: {}}
     if config:
         (
@@ -1956,6 +1960,8 @@ async def chat_completion(
         else:  # router is not set
             response = await litellm.acompletion(**data)
 
+        # Post Call Processing
+        data["litellm_status"] = "success"  # used for alerting
         if hasattr(response, "_hidden_params"):
             model_id = response._hidden_params.get("model_id", None) or ""
         else:
@@ -2141,6 +2147,7 @@ async def embeddings(
             response = await litellm.aembedding(**data)
 
         ### ALERTING ###
+        data["litellm_status"] = "success"  # used for alerting
         end_time = time.time()
         asyncio.create_task(
             proxy_logging_obj.response_taking_too_long(
@@ -2256,6 +2263,7 @@ async def image_generation(
             response = await litellm.aimage_generation(**data)
 
         ### ALERTING ###
+        data["litellm_status"] = "success"  # used for alerting
         end_time = time.time()
         asyncio.create_task(
             proxy_logging_obj.response_taking_too_long(
diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma
index 441c3515f..f06d42ba5 100644
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@@ -58,4 +58,5 @@ model LiteLLM_SpendLogs {
   usage       Json     @default("{}")
   metadata    Json     @default("{}")
   cache_hit   String   @default("")
+  cache_key   String   @default("")
 }
\ No newline at end of file
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index adc5fa486..0debcb235 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -97,7 +97,7 @@ class ProxyLogging:
         3. /image/generation
         """
         ### ALERTING ###
-        asyncio.create_task(self.response_taking_too_long())
+        asyncio.create_task(self.response_taking_too_long(request_data=data))
 
         try:
             for callback in litellm.callbacks:
@@ -137,24 +137,47 @@ class ProxyLogging:
         start_time: Optional[float] = None,
         end_time: Optional[float] = None,
         type: Literal["hanging_request", "slow_response"] = "hanging_request",
+        request_data: Optional[dict] = None,
     ):
+        if request_data is not None:
+            model = request_data.get("model", "")
+            messages = request_data.get("messages", "")
+            # try casting messages to str and take the first 10000 characters, else mark as None
+            try:
+                messages = str(messages)
+                messages = messages[:10000]
+            except:
+                messages = None
+
+            request_info = f"\nRequest Model: {model}\nMessages: {messages}"
+        else:
+            request_info = ""
+
         if type == "hanging_request":
             # Simulate a long-running operation that could take more than 5 minutes
             await asyncio.sleep(
                 self.alerting_threshold
             )  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
-
-            await self.alerting_handler(
-                message=f"Requests are hanging - {self.alerting_threshold}s+ request time",
-                level="Medium",
-            )
+            if (
+                request_data is not None
+                and request_data.get("litellm_status", "") != "success"
+            ):
+                # only alert hanging responses if they have not been marked as success
+                alerting_message = (
+                    f"Requests are hanging - {self.alerting_threshold}s+ request time"
+                )
+                await self.alerting_handler(
+                    message=alerting_message + request_info,
+                    level="Medium",
+                )
 
         elif (
             type == "slow_response" and start_time is not None and end_time is not None
        ):
+            slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
             if end_time - start_time > self.alerting_threshold:
                 await self.alerting_handler(
-                    message=f"Responses are slow - {round(end_time-start_time,2)}s response time",
+                    message=slow_message + request_info,
                     level="Low",
                 )
 
@@ -173,7 +196,13 @@ class ProxyLogging:
             level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
             message: str - what is the alert about
         """
-        formatted_message = f"Level: {level}\n\nMessage: {message}"
+        from datetime import datetime
+
+        # Get the current timestamp
+        current_time = datetime.now().strftime("%H:%M:%S")
+        formatted_message = (
+            f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
+        )
         if self.alerting is None:
             return
 
@@ -184,7 +213,9 @@
                     raise Exception("Missing SLACK_WEBHOOK_URL from environment")
                 payload = {"text": formatted_message}
                 headers = {"Content-type": "application/json"}
-                async with aiohttp.ClientSession() as session:
+                async with aiohttp.ClientSession(
+                    connector=aiohttp.TCPConnector(ssl=False)
+                ) as session:
                     async with session.post(
                         slack_webhook_url, json=payload, headers=headers
                     ) as response:
@@ -972,11 +1003,18 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
     if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
         # hash the api_key
         api_key = hash_token(api_key)
-
     if "headers" in metadata and "authorization" in metadata["headers"]:
         metadata["headers"].pop(
             "authorization"
         )  # do not store the original `sk-..` api key in the db
+    if litellm.cache is not None:
+        cache_key = litellm.cache.get_cache_key(**kwargs)
+    else:
+        cache_key = "Cache OFF"
+    if cache_hit == True:
+        import time
+
+        id = f"{id}_cache_hit{time.time()}"  # SpendLogs does not allow duplicate request_id
 
     payload = {
         "request_id": id,
@@ -990,6 +1028,7 @@
         "modelParameters": optional_params,
         "usage": usage,
         "metadata": metadata,
+        "cache_key": cache_key,
     }
 
     json_fields = [
diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py
index 5231f93d3..27392e0f0 100644
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@@ -888,6 +888,9 @@ def test_call_with_key_over_budget(prisma_client):
         # update spend using track_cost callback, make 2nd request, it should fail
         from litellm.proxy.proxy_server import track_cost_callback
         from litellm import ModelResponse, Choices, Message, Usage
+        from litellm.caching import Cache
+
+        litellm.cache = Cache()
         import time
 
         request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
@@ -935,6 +938,10 @@
         assert spend_log.request_id == request_id
         assert spend_log.spend == float("2e-05")
         assert spend_log.model == "chatgpt-v-2"
+        assert (
+            spend_log.cache_key
+            == "a61ae14fe4a8b8014a61e6ae01a100c8bc6770ac37c293242afed954bc69207d"
+        )
 
         # use generated key to auth in
         result = await user_api_key_auth(request=request, api_key=bearer_token)
@@ -948,6 +955,76 @@
         print(vars(e))
 
 
+@pytest.mark.asyncio()
+async def test_call_with_key_never_over_budget(prisma_client):
+    # Make a call with a key with budget=None, it should never fail
+    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
+    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
+    try:
+        await litellm.proxy.proxy_server.prisma_client.connect()
+        request = GenerateKeyRequest(max_budget=None)
+        key = await generate_key_fn(request)
+        print(key)
+
+        generated_key = key.key
+        user_id = key.user_id
+        bearer_token = "Bearer " + generated_key
+
+        request = Request(scope={"type": "http"})
+        request._url = URL(url="/chat/completions")
+
+        # use generated key to auth in
+        result = await user_api_key_auth(request=request, api_key=bearer_token)
+        print("result from user auth with new key", result)
+
+        # update spend using track_cost callback, make 2nd request, it should not fail
+        from litellm.proxy.proxy_server import track_cost_callback
+        from litellm import ModelResponse, Choices, Message, Usage
+        import time
+
+        request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
+
+        resp = ModelResponse(
+            id=request_id,
+            choices=[
+                Choices(
+                    finish_reason=None,
+                    index=0,
+                    message=Message(
+                        content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
+                        role="assistant",
+                    ),
+                )
+            ],
+            model="gpt-35-turbo",  # azure always has model written like this
+            usage=Usage(
+                prompt_tokens=210000, completion_tokens=200000, total_tokens=41000
+            ),
+        )
+        await track_cost_callback(
+            kwargs={
+                "model": "chatgpt-v-2",
+                "stream": False,
+                "litellm_params": {
+                    "metadata": {
+                        "user_api_key": generated_key,
+                        "user_api_key_user_id": user_id,
+                    }
+                },
+                "response_cost": 200000,
+            },
+            completion_response=resp,
+            start_time=datetime.now(),
+            end_time=datetime.now(),
+        )
+
+        # use generated key to auth in
+        result = await user_api_key_auth(request=request, api_key=bearer_token)
+        print("result from user auth with new key", result)
+    except Exception as e:
+        pytest.fail(f"This should not have failed! The key uses max_budget=None. {e}")
+
+
 @pytest.mark.asyncio()
 async def test_call_with_key_over_budget_stream(prisma_client):
     # 14. Make a call with a key over budget, expect to fail
diff --git a/litellm/utils.py b/litellm/utils.py
index b41a554d7..0e12463b9 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2877,8 +2877,13 @@ def token_counter(
             print_verbose(
                 f"Token Counter - using generic token counter, for model={model}"
             )
-            enc = tokenizer_json["tokenizer"].encode(text)
-            num_tokens = len(enc)
+            num_tokens = openai_token_counter(
+                text=text,  # type: ignore
+                model="gpt-3.5-turbo",
+                messages=messages,
+                is_tool_call=is_tool_call,
+                count_response_tokens=count_response_tokens,
+            )
     else:
         num_tokens = len(encoding.encode(text))  # type: ignore
     return num_tokens
diff --git a/pyproject.toml b/pyproject.toml
index f8d15f17d..fb8507d8c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.19.0"
+version = "1.19.1"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.19.0"
+version = "1.19.1"
 version_files = [
     "pyproject.toml:^version"
 ]
diff --git a/schema.prisma b/schema.prisma
index 441c3515f..72d14e13b 100644
--- a/schema.prisma
+++ b/schema.prisma
@@ -7,6 +7,7 @@ generator client {
   provider = "prisma-client-py"
 }
 
+// Track spend, rate limit, budget Users
 model LiteLLM_UserTable {
   user_id    String @unique
   team_id    String?
@@ -21,7 +22,7 @@ model LiteLLM_UserTable {
   budget_reset_at DateTime?
 }
 
-// required for token gen
+// Generate Tokens for Proxy
 model LiteLLM_VerificationToken {
   token String @unique
   spend Float @default(0.0)
@@ -40,11 +41,13 @@ model LiteLLM_VerificationToken {
   budget_reset_at DateTime?
 }
 
+// store proxy config.yaml
 model LiteLLM_Config {
   param_name String @id
   param_value Json?
 }
 
+// View spend, model, api_key per request
 model LiteLLM_SpendLogs {
   request_id String @unique
   call_type  String
@@ -58,4 +61,5 @@ model LiteLLM_SpendLogs {
   usage       Json     @default("{}")
   metadata    Json     @default("{}")
   cache_hit   String   @default("")
+  cache_key   String   @default("")
 }
\ No newline at end of file