From 087bd5e26773da2c73f34875b22e22c83ad40023 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 14:55:21 -0800
Subject: [PATCH 01/15] (feat) slack alerting - log request/response

---
 litellm/proxy/utils.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 978355568..222a21592 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -97,7 +97,7 @@ class ProxyLogging:
         3. /image/generation
         """
         ### ALERTING ###
-        asyncio.create_task(self.response_taking_too_long())
+        asyncio.create_task(self.response_taking_too_long(request_data=data))
 
         try:
             for callback in litellm.callbacks:
@@ -137,6 +137,8 @@ class ProxyLogging:
         start_time: Optional[float] = None,
         end_time: Optional[float] = None,
         type: Literal["hanging_request", "slow_response"] = "hanging_request",
+        request_data: Optional[dict] = None,
+        response_obj: Optional[litellm.ModelResponse] = None,
     ):
         if type == "hanging_request":
             # Simulate a long-running operation that could take more than 5 minutes
@@ -144,8 +146,12 @@ class ProxyLogging:
                 self.alerting_threshold
             )  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
 
+            alerting_message = (
+                f"Requests are hanging - {self.alerting_threshold}s+ request time"
+            )
             await self.alerting_handler(
-                message=f"Requests are hanging - {self.alerting_threshold}s+ request time",
+                message=alerting_message
+                + f"\nRequest: {request_data}\nResponse: {response_obj}",
                 level="Medium",
             )
 
@@ -184,7 +190,9 @@ class ProxyLogging:
             raise Exception("Missing SLACK_WEBHOOK_URL from environment")
         payload = {"text": formatted_message}
         headers = {"Content-type": "application/json"}
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(
+            connector=aiohttp.TCPConnector(ssl=False)
+        ) as session:
             async with session.post(
                 slack_webhook_url, json=payload, headers=headers
             ) as response:

From 47110180c8c26cc786fc83efe50af2607991733c Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 15:16:18 -0800
Subject: [PATCH 02/15] (feat) proxy - add timestamp to debug logs

---
 litellm/_logging.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/litellm/_logging.py b/litellm/_logging.py
index e9a4a99cd..d06d8cb6f 100644
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@@ -7,8 +7,11 @@
 handler = logging.StreamHandler()
 handler.setLevel(logging.DEBUG)
 
 # Create a formatter and set it for the handler
+formatter = logging.Formatter(
+    "\033[92m%(asctime)s - %(name)s - %(levelname)s\033[0m: %(message)s",
+    datefmt="%H:%M:%S",
+)
 
-formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
 handler.setFormatter(formatter)

From 8f4e256531377b72515d5a1b9b4ef7ba9c46483b Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 15:17:33 -0800
Subject: [PATCH 03/15] (feat) add request_info to slack alerts

---
 litellm/proxy/utils.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 222a21592..052095497 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -138,8 +138,20 @@ class ProxyLogging:
         end_time: Optional[float] = None,
         type: Literal["hanging_request", "slow_response"] = "hanging_request",
         request_data: Optional[dict] = None,
-        response_obj: Optional[litellm.ModelResponse] = None,
     ):
+        if request_data is not None:
+            model = request_data.get("model", "")
+            messages = request_data.get("messages", "")
+            # try casting messages to str and get the first 100 characters, else mark as None
+            try:
+                messages = str(messages)
+                messages = messages[:10000]
+            except:
+                messages = None
+
+            request_info = f"\nRequest Model: {model}\nMessages: {messages}"
+        else:
+            request_info = ""
         if type == "hanging_request":
             # Simulate a long-running operation that could take more than 5 minutes
             await asyncio.sleep(
                 self.alerting_threshold
             )  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
 
             alerting_message = (
                 f"Requests are hanging - {self.alerting_threshold}s+ request time"
             )
             await self.alerting_handler(
-                message=alerting_message
-                + f"\nRequest: {request_data}\nResponse: {response_obj}",
+                message=alerting_message + request_info,
                 level="Medium",
             )
 
         elif (
             type == "slow_response" and start_time is not None and end_time is not None
         ):
+            slow_message = (
+                f"Responses are slow - {round(end_time-start_time,2)}s response time"
+            )
             if end_time - start_time > self.alerting_threshold:
                 await self.alerting_handler(
-                    message=f"Responses are slow - {round(end_time-start_time,2)}s response time",
+                    message=slow_message + request_info,
                     level="Low",
                 )

From 6c13776701144e3f0ab058c5d9295fe3b15e6828 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 15:25:40 -0800
Subject: [PATCH 04/15] (fix) alerting - show timestamps in alert

---
 litellm/_logging.py    | 2 +-
 litellm/proxy/utils.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/litellm/_logging.py b/litellm/_logging.py
index d06d8cb6f..438fa9743 100644
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@@ -8,7 +8,7 @@ handler.setLevel(logging.DEBUG)
 
 # Create a formatter and set it for the handler
 formatter = logging.Formatter(
-    "\033[92m%(asctime)s - %(name)s - %(levelname)s\033[0m: %(message)s",
+    "\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
     datefmt="%H:%M:%S",
 )
 
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 052095497..ebc2dbc05 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -152,6 +152,7 @@ class ProxyLogging:
             request_info = f"\nRequest Model: {model}\nMessages: {messages}"
         else:
             request_info = ""
+
         if type == "hanging_request":
             # Simulate a long-running operation that could take more than 5 minutes
             await asyncio.sleep(
@@ -193,7 +194,13 @@ class ProxyLogging:
         level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
         message: str - what is the alert about
         """
-        formatted_message = f"Level: {level}\n\nMessage: {message}"
+        from datetime import datetime
+
+        # Get the current timestamp
+        current_time = datetime.now().strftime("%H:%M:%S")
+        formatted_message = (
+            f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
+        )
 
         if self.alerting is None:
             return

From b993c62144e992d2f084717cb5fa2271684b0063 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 15:58:07 -0800
Subject: [PATCH 05/15] (fix) only alert users when requests are hanging

---
 litellm/proxy/proxy_server.py |  4 ++++
 litellm/proxy/utils.py        | 19 +++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index ca58371f4..d8365404c 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1863,6 +1863,8 @@ async def chat_completion(
         else:  # router is not set
             response = await litellm.acompletion(**data)
 
+        # Post Call Processing
+        data["litellm_status"] = "success"  # used for alerting
         if hasattr(response, "_hidden_params"):
             model_id = response._hidden_params.get("model_id", None) or ""
         else:
@@ -2048,6 +2050,7 @@ async def embeddings(
         response = await litellm.aembedding(**data)
 
         ### ALERTING ###
+        data["litellm_status"] = "success"  # used for alerting
         end_time = time.time()
         asyncio.create_task(
             proxy_logging_obj.response_taking_too_long(
@@ -2163,6 +2166,7 @@ async def image_generation(
         response = await litellm.aimage_generation(**data)
 
         ### ALERTING ###
+        data["litellm_status"] = "success"  # used for alerting
         end_time = time.time()
         asyncio.create_task(
             proxy_logging_obj.response_taking_too_long(
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index ebc2dbc05..d638d162d 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -158,14 +158,17 @@ class ProxyLogging:
             await asyncio.sleep(
                 self.alerting_threshold
             )  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
-
-            alerting_message = (
-                f"Requests are hanging - {self.alerting_threshold}s+ request time"
-            )
-            await self.alerting_handler(
-                message=alerting_message + request_info,
-                level="Medium",
-            )
+            if (
+                request_data is not None
+                and request_data.get("litellm_status", "") != "success"
+            ):
+                alerting_message = (
+                    f"Requests are hanging - {self.alerting_threshold}s+ request time"
+                )
+                await self.alerting_handler(
+                    message=alerting_message + request_info,
+                    level="Medium",
+                )
 
         elif (
             type == "slow_response" and start_time is not None and end_time is not None

From 9aae60f1626a99dac5bfdebbbe1463f60e5b8463 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 16:07:46 -0800
Subject: [PATCH 06/15] (FIX) improve slack alerting messages

---
 litellm/proxy/proxy_config.yaml | 2 ++
 litellm/proxy/utils.py          | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 97168b19f..b06faac32 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -67,6 +67,8 @@ litellm_settings:
 
 general_settings:
   master_key: sk-1234
+  alerting: ["slack"]
+  alerting_threshold: 10 # sends alerts if requests hang for 2 seconds
   # database_type: "dynamo_db"
   # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
   #   "billing_mode": "PAY_PER_REQUEST",
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index d638d162d..94e86600a 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -162,6 +162,7 @@ class ProxyLogging:
                 request_data is not None
                 and request_data.get("litellm_status", "") != "success"
             ):
+                # only alert hanging responses if they have not been marked as success
                 alerting_message = (
                     f"Requests are hanging - {self.alerting_threshold}s+ request time"
                 )
@@ -173,9 +174,7 @@ class ProxyLogging:
         elif (
             type == "slow_response" and start_time is not None and end_time is not None
         ):
-            slow_message = (
-                f"Responses are slow - {round(end_time-start_time,2)}s response time"
-            )
+            slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
             if end_time - start_time > self.alerting_threshold:
                 await self.alerting_handler(
                     message=slow_message + request_info,

From 3305dc75ca4081315208aa304f71e4273aacb807 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 17:15:01 -0800
Subject: [PATCH 07/15] (docs) add comments on prisma.schema

---
 schema.prisma | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/schema.prisma b/schema.prisma
index 441c3515f..0dd11eb64 100644
--- a/schema.prisma
+++ b/schema.prisma
@@ -7,6 +7,7 @@ generator client {
   provider = "prisma-client-py"
 }
 
+// Track spend, rate limit, budget Users
 model LiteLLM_UserTable {
   user_id    String @unique
   team_id    String?
@@ -21,7 +22,7 @@ model LiteLLM_UserTable {
   budget_reset_at DateTime?
 }
 
-// required for token gen
+// Generate Tokens for Proxy
 model LiteLLM_VerificationToken {
   token String @unique
   spend Float @default(0.0)
@@ -40,11 +41,13 @@ model LiteLLM_VerificationToken {
   budget_reset_at DateTime?
 }
 
+// store proxy config.yaml
 model LiteLLM_Config {
   param_name String @id
   param_value Json?
 }
 
+// View spend, model, api_key per request
 model LiteLLM_SpendLogs {
   request_id String @unique
   call_type  String

From d6949937030d2d6ea7f04b2fcc1996d30bf917f5 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 17:34:17 -0800
Subject: [PATCH 08/15] (fix) bug from bb7705b4945b28be5cdc4a382de0ce116e24b621

---
 litellm/proxy/utils.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 39a2f91e4..15f230a6a 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -473,11 +473,10 @@ class PrismaClient:
                         "budget_reset_at": {"lt": reset_at},
                     }
                 )
-                return response
-            elif table_name == "user" and query_type == "find_all":
-                response = await self.db.litellm_usertable.find_many(  # type: ignore
-                    order={"spend": "desc"},
-                )
+            elif query_type == "find_all":
+                response = await self.db.litellm_usertable.find_many(  # type: ignore
+                    order={"spend": "desc"},
+                )
                 return response
             elif table_name == "spend":
                 verbose_proxy_logger.debug(

From 2130a61b6eabbb22f1013e71e724ff2ebedd97e8 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 17:56:00 -0800
Subject: [PATCH 09/15] (feat) add cache_key in spend_log

---
 litellm/proxy/_types.py                   | 3 +--
 litellm/proxy/schema.prisma               | 1 +
 litellm/proxy/utils.py                    | 5 +++++
 litellm/tests/test_key_generate_prisma.py | 4 ++++
 schema.prisma                             | 1 +
 5 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py
index 8a059c507..670cefcf2 100644
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@@ -343,8 +343,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
     endTime: Union[str, datetime, None]
     user: Optional[str] = ""
     modelParameters: Optional[Json] = {}
-    messages: Optional[Json] = []
-    response: Optional[Json] = {}
     usage: Optional[Json] = {}
     metadata: Optional[Json] = {}
     cache_hit: Optional[str] = "False"
+    cache_key: Optional[str] = None
diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma
index 441c3515f..f06d42ba5 100644
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@@ -58,4 +58,5 @@ model LiteLLM_SpendLogs {
   usage       Json    @default("{}")
   metadata    Json    @default("{}")
   cache_hit   String  @default("")
+  cache_key   String  @default("")
 }
\ No newline at end of file
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 15f230a6a..d49ace138 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -995,6 +995,10 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
     if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
         # hash the api_key
         api_key = hash_token(api_key)
+    from litellm.caching import Cache
+
+    c = Cache()
+    cache_key = c.get_cache_key(**kwargs)
 
     if "headers" in metadata and "authorization" in metadata["headers"]:
         metadata["headers"].pop(
             "authorization"
         )  # do not store the original `sk-..` api key in the db
@@ -1013,6 +1017,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
         "modelParameters": optional_params,
         "usage": usage,
         "metadata": metadata,
+        "cache_key": cache_key,
     }
 
     json_fields = [
diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py
index 49f091cd6..f7f1d0a91 100644
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@@ -763,6 +763,10 @@ def test_call_with_key_over_budget(prisma_client):
         assert spend_log.request_id == request_id
         assert spend_log.spend == float("2e-05")
         assert spend_log.model == "chatgpt-v-2"
+        assert (
+            spend_log.cache_key
"a61ae14fe4a8b8014a61e6ae01a100c8bc6770ac37c293242afed954bc69207d" + ) # use generated key to auth in result = await user_api_key_auth(request=request, api_key=bearer_token) diff --git a/schema.prisma b/schema.prisma index 0dd11eb64..72d14e13b 100644 --- a/schema.prisma +++ b/schema.prisma @@ -61,4 +61,5 @@ model LiteLLM_SpendLogs { usage Json @default("{}") metadata Json @default("{}") cache_hit String @default("") + cache_key String @default("") } \ No newline at end of file From bf851ef19a47aeb1c76a56f16fba7cb29d873f2a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Wed, 24 Jan 2024 18:34:22 -0800 Subject: [PATCH 10/15] (fix) use litellm.cache for getting key --- litellm/proxy/proxy_server.py | 2 -- litellm/proxy/utils.py | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 512e956b0..eaa2373cc 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1382,8 +1382,6 @@ async def initialize( verbose_proxy_logger.setLevel( level=logging.DEBUG ) # set proxy logs to debug - litellm.set_verbose = True - dynamic_config = {"general": {}, user_model: {}} if config: ( diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index d49ace138..812157ca0 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -995,15 +995,14 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time): if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"): # hash the api_key api_key = hash_token(api_key) - from litellm.caching import Cache - - c = Cache() - cache_key = c.get_cache_key(**kwargs) - if "headers" in metadata and "authorization" in metadata["headers"]: metadata["headers"].pop( "authorization" ) # do not store the original `sk-..` api key in the db + if litellm.cache is not None: + cache_key = litellm.cache.get_cache_key(**kwargs) + else: + cache_key = "Cache OFF" payload = { "request_id": id, From 2f3765a03f5ff659c8292edf3e0491297e477fe7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Wed, 24 Jan 2024 18:51:39 -0800 Subject: [PATCH 11/15] (fix) log cache hits on SpendLogs table --- litellm/proxy/proxy_server.py | 6 ++++++ litellm/proxy/utils.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index eaa2373cc..e1ca25e13 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -593,6 +593,12 @@ async def track_cost_callback( "user_api_key_user_id", None ) + if kwargs.get("cache_hit", False) == True: + response_cost = 0.0 + verbose_proxy_logger.info( + f"Cache Hit: response_cost {response_cost}, for user_id {user_id}" + ) + verbose_proxy_logger.info( f"response_cost {response_cost}, for user_id {user_id}" ) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 812157ca0..25c5c82ce 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -1003,6 +1003,10 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time): cache_key = litellm.cache.get_cache_key(**kwargs) else: cache_key = "Cache OFF" + if cache_hit == True: + import time + + id = f"{id}_cache_hit{time.time()}" # SpendLogs does not allow duplicate request_id payload = { "request_id": id, From 6bc715cf85f9d9e18dc67ecf2bdffe90ef05d022 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Wed, 24 Jan 2024 18:54:23 -0800 Subject: [PATCH 12/15] (test) logging cache_key in spendLogs --- litellm/tests/test_key_generate_prisma.py | 3 +++ 1 file changed, 3 
diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py
index f7f1d0a91..78fb756b2 100644
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@@ -716,6 +716,9 @@ def test_call_with_key_over_budget(prisma_client):
         # update spend using track_cost callback, make 2nd request, it should fail
         from litellm.proxy.proxy_server import track_cost_callback
         from litellm import ModelResponse, Choices, Message, Usage
+        from litellm.caching import Cache
+
+        litellm.cache = Cache()
         import time
 
         request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"

From 3e59a02dfb451f52c6e3b2a534165ea092c8f38d Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Wed, 24 Jan 2024 19:27:53 -0800
Subject: [PATCH 13/15] (test) test /key/gen with max_budget=None

---
 litellm/tests/test_key_generate_prisma.py | 70 +++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py
index 78fb756b2..5efef3932 100644
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@@ -783,6 +783,76 @@ def test_call_with_key_over_budget(prisma_client):
         print(vars(e))
 
 
+@pytest.mark.asyncio()
+async def test_call_with_key_never_over_budget(prisma_client):
+    # Make a call with a key with budget=None, it should never fail
+    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
+    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
+    try:
+        await litellm.proxy.proxy_server.prisma_client.connect()
+        request = GenerateKeyRequest(max_budget=None)
+        key = await generate_key_fn(request)
+        print(key)
+
+        generated_key = key.key
+        user_id = key.user_id
+        bearer_token = "Bearer " + generated_key
+
+        request = Request(scope={"type": "http"})
+        request._url = URL(url="/chat/completions")
+
+        # use generated key to auth in
+        result = await user_api_key_auth(request=request, api_key=bearer_token)
+        print("result from user auth with new key", result)
+
+        # update spend using track_cost callback, make 2nd request, it should fail
+        from litellm.proxy.proxy_server import track_cost_callback
+        from litellm import ModelResponse, Choices, Message, Usage
+        import time
+
+        request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
+
+        resp = ModelResponse(
+            id=request_id,
+            choices=[
+                Choices(
+                    finish_reason=None,
+                    index=0,
+                    message=Message(
+                        content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
+                        role="assistant",
+                    ),
+                )
+            ],
+            model="gpt-35-turbo",  # azure always has model written like this
+            usage=Usage(
+                prompt_tokens=210000, completion_tokens=200000, total_tokens=41000
+            ),
+        )
+        await track_cost_callback(
+            kwargs={
+                "model": "chatgpt-v-2",
+                "stream": False,
+                "litellm_params": {
+                    "metadata": {
+                        "user_api_key": generated_key,
+                        "user_api_key_user_id": user_id,
+                    }
+                },
+                "response_cost": 200000,
+            },
+            completion_response=resp,
+            start_time=datetime.now(),
+            end_time=datetime.now(),
+        )
+
+        # use generated key to auth in
+        result = await user_api_key_auth(request=request, api_key=bearer_token)
+        print("result from user auth with new key", result)
+    except Exception as e:
+        pytest.fail(f"This should have not failed!. They key uses max_budget=None. {e}")
+
+
 @pytest.mark.asyncio()
 async def test_call_with_key_over_budget_stream(prisma_client):
     # 14. Make a call with a key over budget, expect to fail

From 43f139fafd8e69d81c5fd5d8f95d511e0953c36f Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Wed, 24 Jan 2024 20:09:08 -0800
Subject: [PATCH 14/15] fix(ollama_chat.py): fix default token counting for ollama chat

---
 litellm/llms/ollama_chat.py | 12 ++++++++----
 litellm/utils.py            |  9 +++++++--
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py
index 31e3f0d16..e381c93f7 100644
--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@@ -220,8 +220,10 @@ def get_ollama_response(
     model_response["choices"][0]["message"] = response_json["message"]
     model_response["created"] = int(time.time())
     model_response["model"] = "ollama/" + model
-    prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
-    completion_tokens = response_json["eval_count"]
+    prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages))  # type: ignore
+    completion_tokens = response_json.get(
+        "eval_count", litellm.token_counter(text=response_json["message"])
+    )
     model_response["usage"] = litellm.Usage(
         prompt_tokens=prompt_tokens,
         completion_tokens=completion_tokens,
@@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
     model_response["choices"][0]["message"] = response_json["message"]
     model_response["created"] = int(time.time())
     model_response["model"] = "ollama/" + data["model"]
-    prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
-    completion_tokens = response_json["eval_count"]
+    prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"]))  # type: ignore
+    completion_tokens = response_json.get(
+        "eval_count", litellm.token_counter(text=response_json["message"])
+    )
     model_response["usage"] = litellm.Usage(
         prompt_tokens=prompt_tokens,
         completion_tokens=completion_tokens,
diff --git a/litellm/utils.py b/litellm/utils.py
index 03d38ff35..4718083c2 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2872,8 +2872,13 @@ def token_counter(
             print_verbose(
                 f"Token Counter - using generic token counter, for model={model}"
             )
-            enc = tokenizer_json["tokenizer"].encode(text)
-            num_tokens = len(enc)
+            num_tokens = openai_token_counter(
+                text=text,  # type: ignore
+                model="gpt-3.5-turbo",
+                messages=messages,
+                is_tool_call=is_tool_call,
+                count_response_tokens=count_response_tokens,
+            )
         else:
             num_tokens = len(encoding.encode(text))  # type: ignore
     return num_tokens

From b1864c3d115d0148755c39b7d1e532a54691a601 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Wed, 24 Jan 2024 20:10:14 -0800
Subject: [PATCH 15/15] =?UTF-8?q?bump:=20version=201.19.0=20=E2=86=92=201.?=
 =?UTF-8?q?19.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f8d15f17d..fb8507d8c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.19.0"
+version = "1.19.1"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.19.0"
+version = "1.19.1"
 version_files = [
     "pyproject.toml:^version"
 ]