From 09ec6d645851fcc62b2851eb4b421a2a77e89468 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 12:49:45 -0800 Subject: [PATCH 01/27] fix(utils.py): fix sagemaker async logging for sync streaming https://github.com/BerriAI/litellm/issues/1592 --- .circleci/config.yml | 3 + litellm/llms/sagemaker.py | 35 +++++--- litellm/main.py | 11 +-- litellm/proxy/proxy_server.py | 3 + litellm/proxy/utils.py | 5 +- litellm/tests/test_custom_callback_input.py | 41 +++++++++ litellm/tests/test_streaming.py | 70 ++++++++------- litellm/utils.py | 94 ++++++++++++++++++--- proxy_server_config.yaml | 4 + tests/test_keys.py | 45 +++++++++- 10 files changed, 247 insertions(+), 64 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1de72a156..e0e6f5743 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -147,6 +147,9 @@ jobs: -e AZURE_API_KEY=$AZURE_API_KEY \ -e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \ -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e AWS_REGION_NAME=$AWS_REGION_NAME \ --name my-app \ -v $(pwd)/proxy_server_config.yaml:/app/config.yaml \ my-app:latest \ diff --git a/litellm/llms/sagemaker.py b/litellm/llms/sagemaker.py index 1608f7a0f..78aafe7f7 100644 --- a/litellm/llms/sagemaker.py +++ b/litellm/llms/sagemaker.py @@ -34,22 +34,35 @@ class TokenIterator: self.byte_iterator = iter(stream) self.buffer = io.BytesIO() self.read_pos = 0 + self.end_of_data = False def __iter__(self): return self def __next__(self): - while True: - self.buffer.seek(self.read_pos) - line = self.buffer.readline() - if line and line[-1] == ord("\n"): - self.read_pos += len(line) + 1 - full_line = line[:-1].decode("utf-8") - line_data = json.loads(full_line.lstrip("data:").rstrip("/n")) - return line_data["token"]["text"] - chunk = next(self.byte_iterator) - self.buffer.seek(0, io.SEEK_END) - self.buffer.write(chunk["PayloadPart"]["Bytes"]) + try: + while True: + self.buffer.seek(self.read_pos) + line = self.buffer.readline() + if line and line[-1] == ord("\n"): + response_obj = {"text": "", "is_finished": False} + self.read_pos += len(line) + 1 + full_line = line[:-1].decode("utf-8") + line_data = json.loads(full_line.lstrip("data:").rstrip("/n")) + if line_data.get("generated_text", None) is not None: + self.end_of_data = True + response_obj["is_finished"] = True + response_obj["text"] = line_data["token"]["text"] + return response_obj + chunk = next(self.byte_iterator) + self.buffer.seek(0, io.SEEK_END) + self.buffer.write(chunk["PayloadPart"]["Bytes"]) + except StopIteration as e: + if self.end_of_data == True: + raise e # Re-raise StopIteration + else: + self.end_of_data = True + return "data: [DONE]" class SagemakerConfig: diff --git a/litellm/main.py b/litellm/main.py index 6b9a0bb18..fca3bd2b2 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -1514,11 +1514,6 @@ def completion( if ( "stream" in optional_params and optional_params["stream"] == True ): ## [BETA] - # sagemaker does not support streaming as of now so we're faking streaming: - # https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611 - # "SageMaker is currently not supporting streaming responses." 
- - # fake streaming for sagemaker print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER") from .llms.sagemaker import TokenIterator @@ -1529,6 +1524,12 @@ def completion( custom_llm_provider="sagemaker", logging_obj=logging, ) + ## LOGGING + logging.post_call( + input=messages, + api_key=None, + original_response=response, + ) return response ## RESPONSE OBJECT diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index b53484b86..493ad9731 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -690,6 +690,9 @@ async def update_database( existing_spend_obj = await custom_db_client.get_data( key=id, table_name="user" ) + verbose_proxy_logger.debug( + f"Updating existing_spend_obj: {existing_spend_obj}" + ) if existing_spend_obj is None: existing_spend = 0 existing_spend_obj = LiteLLM_UserTable( diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index faa73d70b..728716886 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -409,7 +409,9 @@ class PrismaClient: hashed_token = token if token.startswith("sk-"): hashed_token = self.hash_token(token=token) - print_verbose("PrismaClient: find_unique") + verbose_proxy_logger.debug( + f"PrismaClient: find_unique for token: {hashed_token}" + ) if query_type == "find_unique": response = await self.db.litellm_verificationtoken.find_unique( where={"token": hashed_token} @@ -716,7 +718,6 @@ class PrismaClient: Batch write update queries """ batcher = self.db.batch_() - verbose_proxy_logger.debug(f"data list for user table: {data_list}") for idx, user in enumerate(data_list): try: data_json = self.jsonify_object(data=user.model_dump()) diff --git a/litellm/tests/test_custom_callback_input.py b/litellm/tests/test_custom_callback_input.py index 556628d82..a61cc843e 100644 --- a/litellm/tests/test_custom_callback_input.py +++ b/litellm/tests/test_custom_callback_input.py @@ -556,6 +556,47 @@ async def test_async_chat_bedrock_stream(): # asyncio.run(test_async_chat_bedrock_stream()) + +## Test Sagemaker + Async +@pytest.mark.asyncio +async def test_async_chat_sagemaker_stream(): + try: + customHandler = CompletionCustomHandler() + litellm.callbacks = [customHandler] + response = await litellm.acompletion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}], + ) + # test streaming + response = await litellm.acompletion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}], + stream=True, + ) + print(f"response: {response}") + async for chunk in response: + print(f"chunk: {chunk}") + continue + ## test failure callback + try: + response = await litellm.acompletion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}], + aws_region_name="my-bad-key", + stream=True, + ) + async for chunk in response: + continue + except: + pass + time.sleep(1) + print(f"customHandler.errors: {customHandler.errors}") + assert len(customHandler.errors) == 0 + litellm.callbacks = [] + except Exception as e: + pytest.fail(f"An exception occurred: {str(e)}") + + # Text Completion diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 14b1a7210..d9f99bece 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -872,41 +872,53 @@ async def test_sagemaker_streaming_async(): ) # Add any assertions here to check the 
response + print(response) complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + idx = 0 async for chunk in response: - complete_response += chunk.choices[0].delta.content or "" - print(f"complete_response: {complete_response}") - assert len(complete_response) > 0 + # print + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + complete_response += chunk + if finished: + break + idx += 1 + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") except Exception as e: pytest.fail(f"An exception occurred - {str(e)}") -# def test_completion_sagemaker_stream(): -# try: -# response = completion( -# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", -# messages=messages, -# temperature=0.2, -# max_tokens=80, -# stream=True, -# ) -# complete_response = "" -# has_finish_reason = False -# # Add any assertions here to check the response -# for idx, chunk in enumerate(response): -# chunk, finished = streaming_format_tests(idx, chunk) -# has_finish_reason = finished -# if finished: -# break -# complete_response += chunk -# if has_finish_reason is False: -# raise Exception("finish reason not set for last chunk") -# if complete_response.strip() == "": -# raise Exception("Empty response received") -# except InvalidRequestError as e: -# pass -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +def test_completion_sagemaker_stream(): + try: + response = completion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=messages, + temperature=0.2, + max_tokens=80, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + except Exception as e: + pytest.fail(f"Error occurred: {e}") + # test_completion_sagemaker_stream() diff --git a/litellm/utils.py b/litellm/utils.py index 0e12463b9..fb3210b1d 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1417,7 +1417,9 @@ class Logging: """ Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions. 
""" - print_verbose(f"Async success callbacks: {litellm._async_success_callback}") + verbose_logger.debug( + f"Async success callbacks: {litellm._async_success_callback}" + ) start_time, end_time, result = self._success_handler_helper_fn( start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit ) @@ -1426,7 +1428,7 @@ class Logging: if self.stream: if result.choices[0].finish_reason is not None: # if it's the last chunk self.streaming_chunks.append(result) - # print_verbose(f"final set of received chunks: {self.streaming_chunks}") + # verbose_logger.debug(f"final set of received chunks: {self.streaming_chunks}") try: complete_streaming_response = litellm.stream_chunk_builder( self.streaming_chunks, @@ -1435,14 +1437,16 @@ class Logging: end_time=end_time, ) except Exception as e: - print_verbose( + verbose_logger.debug( f"Error occurred building stream chunk: {traceback.format_exc()}" ) complete_streaming_response = None else: self.streaming_chunks.append(result) if complete_streaming_response is not None: - print_verbose("Async success callbacks: Got a complete streaming response") + verbose_logger.debug( + "Async success callbacks: Got a complete streaming response" + ) self.model_call_details[ "complete_streaming_response" ] = complete_streaming_response @@ -7682,6 +7686,27 @@ class CustomStreamWrapper: } return "" + def handle_sagemaker_stream(self, chunk): + if "data: [DONE]" in chunk: + text = "" + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif isinstance(chunk, dict): + if chunk["is_finished"] == True: + finish_reason = "stop" + else: + finish_reason = "" + return { + "text": chunk["text"], + "is_finished": chunk["is_finished"], + "finish_reason": finish_reason, + } + def chunk_creator(self, chunk): model_response = ModelResponse(stream=True, model=self.model) if self.response_id is not None: @@ -7807,8 +7832,14 @@ class CustomStreamWrapper: ] self.sent_last_chunk = True elif self.custom_llm_provider == "sagemaker": - print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") - completion_obj["content"] = chunk + verbose_logger.debug(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") + response_obj = self.handle_sagemaker_stream(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj[ + "finish_reason" + ] + self.sent_last_chunk = True elif self.custom_llm_provider == "petals": if len(self.completion_stream) == 0: if self.sent_last_chunk: @@ -7984,6 +8015,19 @@ class CustomStreamWrapper: original_exception=e, ) + def run_success_logging_in_thread(self, processed_chunk): + # Create an event loop for the new thread + ## ASYNC LOGGING + # Run the asynchronous function in the new thread's event loop + asyncio.run( + self.logging_obj.async_success_handler( + processed_chunk, + ) + ) + + ## SYNC LOGGING + self.logging_obj.success_handler(processed_chunk) + ## needs to handle the empty string case (even starting chunk can be an empty string) def __next__(self): try: @@ -8002,8 +8046,9 @@ class CustomStreamWrapper: continue ## LOGGING threading.Thread( - target=self.logging_obj.success_handler, args=(response,) + target=self.run_success_logging_in_thread, args=(response,) ).start() # log response + # RETURN RESULT return response except StopIteration: @@ -8059,13 +8104,34 @@ class CustomStreamWrapper: raise StopAsyncIteration else: # temporary patch for non-aiohttp async calls # 
example - boto3 bedrock llms - processed_chunk = next(self) - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, - ) - ) - return processed_chunk + while True: + if isinstance(self.completion_stream, str) or isinstance( + self.completion_stream, bytes + ): + chunk = self.completion_stream + else: + chunk = next(self.completion_stream) + if chunk is not None and chunk != b"": + print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}") + processed_chunk = self.chunk_creator(chunk=chunk) + print_verbose( + f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" + ) + if processed_chunk is None: + continue + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(processed_chunk,), + ).start() # log processed_chunk + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, + ) + ) + + # RETURN RESULT + return processed_chunk except StopAsyncIteration: raise except StopIteration: diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index dfa8e1151..2c123d156 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -11,6 +11,10 @@ model_list: api_base: https://openai-gpt-4-test-v-1.openai.azure.com/ api_version: "2023-05-15" api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault + - model_name: sagemaker-completion-model + litellm_params: + model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4 + input_cost_per_second: 0.000420 - model_name: gpt-4 litellm_params: model: azure/gpt-turbo diff --git a/tests/test_keys.py b/tests/test_keys.py index f05204c03..cb06e1f7e 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -13,17 +13,21 @@ sys.path.insert( import litellm -async def generate_key(session, i, budget=None, budget_duration=None): +async def generate_key( + session, i, budget=None, budget_duration=None, models=["azure-models", "gpt-4"] +): url = "http://0.0.0.0:4000/key/generate" headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"} data = { - "models": ["azure-models", "gpt-4"], + "models": models, "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": None, "max_budget": budget, "budget_duration": budget_duration, } + print(f"data: {data}") + async with session.post(url, headers=headers, json=data) as response: status = response.status response_text = await response.text() @@ -293,7 +297,7 @@ async def test_key_info_spend_values(): rounded_response_cost = round(response_cost, 8) rounded_key_info_spend = round(key_info["info"]["spend"], 8) assert rounded_response_cost == rounded_key_info_spend - ## streaming + ## streaming - azure key_gen = await generate_key(session=session, i=0) new_key = key_gen["key"] prompt_tokens, completion_tokens = await chat_completion_streaming( @@ -318,6 +322,41 @@ async def test_key_info_spend_values(): assert rounded_response_cost == rounded_key_info_spend +@pytest.mark.asyncio +async def test_key_info_spend_values_sagemaker(): + """ + Tests the sync streaming loop to ensure spend is correctly calculated. 
+ - create key + - make completion call + - assert cost is expected value + """ + async with aiohttp.ClientSession() as session: + ## streaming - sagemaker + key_gen = await generate_key(session=session, i=0, models=[]) + new_key = key_gen["key"] + prompt_tokens, completion_tokens = await chat_completion_streaming( + session=session, key=new_key, model="sagemaker-completion-model" + ) + # print(f"prompt_tokens: {prompt_tokens}, completion_tokens: {completion_tokens}") + # prompt_cost, completion_cost = litellm.cost_per_token( + # model="azure/gpt-35-turbo", + # prompt_tokens=prompt_tokens, + # completion_tokens=completion_tokens, + # ) + # response_cost = prompt_cost + completion_cost + await asyncio.sleep(5) # allow db log to be updated + key_info = await get_key_info( + session=session, get_key=new_key, call_key=new_key + ) + # print( + # f"response_cost: {response_cost}; key_info spend: {key_info['info']['spend']}" + # ) + # rounded_response_cost = round(response_cost, 8) + rounded_key_info_spend = round(key_info["info"]["spend"], 8) + assert rounded_key_info_spend > 0 + # assert rounded_response_cost == rounded_key_info_spend + + @pytest.mark.asyncio async def test_key_with_budgets(): """ From bbe6a92eb999bdfd97149286d59d1c37f72178e8 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 14:51:08 -0800 Subject: [PATCH 02/27] fix(main.py): fix order of assembly for streaming chunks --- litellm/main.py | 6 ++++++ litellm/tests/test_custom_logger.py | 2 +- litellm/utils.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index fca3bd2b2..6b4035473 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -3343,6 +3343,12 @@ def stream_chunk_builder( chunks: list, messages: Optional[list] = None, start_time=None, end_time=None ): model_response = litellm.ModelResponse() + ### SORT CHUNKS BASED ON CREATED ORDER ## + if chunks[0]._hidden_params.get("created_at", None): + # Sort chunks based on created_at in ascending order + chunks = sorted( + chunks, key=lambda x: x._hidden_params.get("created_at", float("inf")) + ) # set hidden params from chunk to model_response if model_response is not None and hasattr(model_response, "_hidden_params"): model_response._hidden_params = chunks[0].get("_hidden_params", {}) diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index 565df5b25..e403c3afe 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -211,7 +211,7 @@ def test_azure_completion_stream(): {"role": "system", "content": "You are a helpful assistant."}, { "role": "user", - "content": "write 1 sentence about litellm being amazing", + "content": f"write 1 sentence about litellm being amazing {time.time()}", }, ] complete_streaming_response = "" diff --git a/litellm/utils.py b/litellm/utils.py index fb3210b1d..02ac83d06 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7714,6 +7714,7 @@ class CustomStreamWrapper: else: self.response_id = model_response.id model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider + model_response._hidden_params["created_at"] = time.time() model_response.choices = [StreamingChoices()] model_response.choices[0].finish_reason = None response_obj = {} From 72275ad8cb1a8fe7558726cacda6289a76cdb559 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 15:59:53 -0800 Subject: [PATCH 03/27] fix(main.py): fix logging event loop for async logging but sync streaming --- litellm/main.py | 12 
++++-------- litellm/utils.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 6b4035473..89750ef46 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -274,14 +274,10 @@ async def acompletion( else: # Call the synchronous function using run_in_executor response = await loop.run_in_executor(None, func_with_context) # type: ignore - # if kwargs.get("stream", False): # return an async generator - # return _async_streaming( - # response=response, - # model=model, - # custom_llm_provider=custom_llm_provider, - # args=args, - # ) - # else: + if isinstance(response, CustomStreamWrapper): + response.set_logging_event_loop( + loop=loop + ) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls) return response except Exception as e: custom_llm_provider = custom_llm_provider or "openai" diff --git a/litellm/utils.py b/litellm/utils.py index 02ac83d06..2bc1d34e9 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7116,6 +7116,7 @@ class CustomStreamWrapper: "model_id": (_model_info.get("id", None)) } # returned as x-litellm-model-id response header in proxy self.response_id = None + self.logging_loop = None def __iter__(self): return self @@ -8016,16 +8017,24 @@ class CustomStreamWrapper: original_exception=e, ) + def set_logging_event_loop(self, loop): + self.logging_loop = loop + + async def your_async_function(self): + # Your asynchronous code here + return "Your asynchronous code is running" + def run_success_logging_in_thread(self, processed_chunk): # Create an event loop for the new thread ## ASYNC LOGGING - # Run the asynchronous function in the new thread's event loop - asyncio.run( - self.logging_obj.async_success_handler( - processed_chunk, + if self.logging_loop is not None: + future = asyncio.run_coroutine_threadsafe( + self.logging_obj.async_success_handler(processed_chunk), + loop=self.logging_loop, ) - ) - + result = future.result() + else: + asyncio.run(self.logging_obj.async_success_handler(processed_chunk)) ## SYNC LOGGING self.logging_obj.success_handler(processed_chunk) From 565531fe9e54ec8d0cd9962fb44d82c0ec673d44 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 09:58:43 -0800 Subject: [PATCH 04/27] v0 basic structure --- litellm/proxy/proxy_server.py | 26 ++++++++++++++++++++++++++ litellm/proxy/utils.py | 10 ++++++++++ 2 files changed, 36 insertions(+) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 493ad9731..ca8ad027d 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -407,6 +407,14 @@ async def user_api_key_auth( user_max_budget is not None and user_current_spend is not None ): + asyncio.create_task( + proxy_logging_obj.budget_alerts( + user_max_budget=user_max_budget, + user_current_spend=user_current_spend, + type="user_and_proxy_budget", + user_info=_user, + ) + ) if user_current_spend > user_max_budget: raise Exception( f"ExceededBudget: User {valid_token.user_id} has exceeded their budget. 
Current spend: {user_current_spend}; Max Budget: {user_max_budget}" @@ -422,6 +430,15 @@ async def user_api_key_auth( user_max_budget is not None and user_current_spend is not None ): + asyncio.create_task( + proxy_logging_obj.budget_alerts( + user_max_budget=user_max_budget, + user_current_spend=user_current_spend, + type="user_budget", + user_info=user_id_information, + ) + ) + if user_current_spend > user_max_budget: raise Exception( f"ExceededBudget: User {valid_token.user_id} has exceeded their budget. Current spend: {user_current_spend}; Max Budget: {user_max_budget}" @@ -448,6 +465,15 @@ async def user_api_key_auth( # Check 4. Token Spend is under budget if valid_token.spend is not None and valid_token.max_budget is not None: + asyncio.create_task( + proxy_logging_obj.budget_alerts( + user_max_budget=valid_token.max_budget, + user_current_spend=valid_token.spend, + type="token_budget", + user_info=valid_token, + ) + ) + if valid_token.spend > valid_token.max_budget: raise Exception( f"ExceededTokenBudget: Current spend for token: {valid_token.spend}; Max Budget for Token: {valid_token.max_budget}" diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 728716886..9f7dd1c87 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -181,6 +181,14 @@ class ProxyLogging: level="Low", ) + async def budget_alerts( + self, + type: Literal["token_budget", "user_budget", "user_and_proxy_budget"], + user_max_budget: float, + user_current_spend: float, + ): + pass + async def alerting_handler( self, message: str, level: Literal["Low", "Medium", "High"] ): @@ -191,6 +199,8 @@ class ProxyLogging: - Requests are hanging - Calls are failing - DB Read/Writes are failing + - Proxy Close to max budget + - Key Close to max budget Parameters: level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'. 
From 55115a75b02f79b978976fdf26084d8844b4bf1c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 10:01:32 -0800 Subject: [PATCH 05/27] (feat) alerts proxy budgets --- litellm/proxy/utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 9f7dd1c87..ab14411bb 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -186,7 +186,23 @@ class ProxyLogging: type: Literal["token_budget", "user_budget", "user_and_proxy_budget"], user_max_budget: float, user_current_spend: float, + user_info=None, ): + # percent of max_budget left to spend + percent_left = (user_max_budget - user_current_spend) / user_max_budget + + # check if 15% of max budget is left + if percent_left <= 0.15: + pass + + # check if 5% of max budget is left + if percent_left <= 0.05: + pass + + # check if crossed budget + if user_current_spend >= user_max_budget: + pass + pass async def alerting_handler( From 7a2a7e047f6c8fdc72d006814e42f0654775d71e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:18:06 -0800 Subject: [PATCH 06/27] (feat) slack alerting budgets --- litellm/proxy/utils.py | 45 +++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index ab14411bb..f176687f9 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -188,20 +188,47 @@ class ProxyLogging: user_current_spend: float, user_info=None, ): + if type == "user_and_proxy_budget": + user_info = dict(user_info) + user_id = user_info["user_id"] + max_budget = user_info["max_budget"] + spend = user_info["spend"] + user_email = user_info["user_email"] + user_info = f"""\nUser ID: {user_id}\nMax Budget: {max_budget}\nSpend: {spend}\nUser Email: {user_email}""" + else: + user_info = str(user_info) # percent of max_budget left to spend percent_left = (user_max_budget - user_current_spend) / user_max_budget - - # check if 15% of max budget is left - if percent_left <= 0.15: - pass - - # check if 5% of max budget is left - if percent_left <= 0.05: - pass + verbose_proxy_logger.debug( + f"Bduget Alerts: Percent left: {percent_left} for {user_info}" + ) # check if crossed budget if user_current_spend >= user_max_budget: - pass + message = "Budget Crossed for" + user_info + await self.alerting_handler( + message=message, + level="High", + ) + return + + # check if 5% of max budget is left + if percent_left <= 0.05: + message = "5 Percent budget left for" + user_info + await self.alerting_handler( + message=message, + level="Medium", + ) + return + + # check if 15% of max budget is left + if percent_left <= 0.15: + message = "15 Percent budget left for" + user_info + await self.alerting_handler( + message=message, + level="Low", + ) + return pass From 56f49a87bafe0d9bbc1c27147879679501e88612 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:32:05 -0800 Subject: [PATCH 07/27] (fix) raise exception budget_duration is set and max_budget is Not --- litellm/proxy/proxy_server.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index ca8ad027d..c2c15d7e9 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1654,11 +1654,12 @@ async def startup_event(): user_id="default_user_id", ) - if ( - prisma_client is not None - and litellm.max_budget > 0 - and litellm.budget_duration is not None - ): + if prisma_client is not 
None and litellm.max_budget > 0: + if litellm.budget_duration is None: + raise Exception( + "budget_duration not set on Proxy. budget_duration is required to use max_budget." + ) + # add proxy budget to db in the user table await generate_key_helper_fn( user_id=litellm_proxy_budget_name, From 81c528f6ce678dfb59f7f56e64211ae20c9d9b21 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:39:57 -0800 Subject: [PATCH 08/27] (fix) raise correct error when proxy crossed budget --- litellm/proxy/proxy_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index c2c15d7e9..43e2ec5f5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -415,9 +415,11 @@ async def user_api_key_auth( user_info=_user, ) ) + + _user_id = _user.get("user_id", None) if user_current_spend > user_max_budget: raise Exception( - f"ExceededBudget: User {valid_token.user_id} has exceeded their budget. Current spend: {user_current_spend}; Max Budget: {user_max_budget}" + f"ExceededBudget: User {_user_id} has exceeded their budget. Current spend: {user_current_spend}; Max Budget: {user_max_budget}" ) else: # Token exists, not expired now check if its in budget for the user From 229e4920dfcdedf7d4784fb35693ff6dffc23c09 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:40:20 -0800 Subject: [PATCH 09/27] (fix) better alert message on budgets --- litellm/proxy/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index f176687f9..9bef04034 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -214,7 +214,7 @@ class ProxyLogging: # check if 5% of max budget is left if percent_left <= 0.05: - message = "5 Percent budget left for" + user_info + message = "5% budget left for" + user_info await self.alerting_handler( message=message, level="Medium", @@ -223,7 +223,7 @@ class ProxyLogging: # check if 15% of max budget is left if percent_left <= 0.15: - message = "15 Percent budget left for" + user_info + message = "15% budget left for" + user_info await self.alerting_handler( message=message, level="Low", From 3a1c8f453f061b9af0d7aefe0486f59537bff05b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:40:56 -0800 Subject: [PATCH 10/27] (docs) track max_budget on proxy config.yaml --- litellm/proxy/proxy_config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index b06faac32..65aa21d04 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -61,6 +61,8 @@ model_list: litellm_settings: fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] success_callback: ['langfuse'] + max_budget: 0.025 + budget_duration: 30d # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From b8f917624f47c8d65daa7bd31e36cc36ffc7d2e0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:41:35 -0800 Subject: [PATCH 11/27] (docs) config.yaml --- litellm/proxy/proxy_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 65aa21d04..7cb2714f4 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -61,8 +61,8 @@ model_list: litellm_settings: fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] 
success_callback: ['langfuse'] - max_budget: 0.025 - budget_duration: 30d + max_budget: 0.025 # global budget for proxy + budget_duration: 30d # global budget duration, will reset after 30d # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 1c9b02ad99e62044208b0617c659644a3b6be3fd Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:56:52 -0800 Subject: [PATCH 12/27] (fix) alerting debug statements --- litellm/proxy/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 9bef04034..4c6030e5b 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -200,11 +200,12 @@ class ProxyLogging: # percent of max_budget left to spend percent_left = (user_max_budget - user_current_spend) / user_max_budget verbose_proxy_logger.debug( - f"Bduget Alerts: Percent left: {percent_left} for {user_info}" + f"Budget Alerts: Percent left: {percent_left} for {user_info}" ) # check if crossed budget if user_current_spend >= user_max_budget: + verbose_proxy_logger.debug(f"Budget Crossed for {user_info}") message = "Budget Crossed for" + user_info await self.alerting_handler( message=message, @@ -230,7 +231,7 @@ class ProxyLogging: ) return - pass + return async def alerting_handler( self, message: str, level: Literal["Low", "Medium", "High"] From 5264a3eb53eb441e5f56d9b4b388b655021d8d33 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:58:55 -0800 Subject: [PATCH 13/27] (fix) do nothing if alerting is not switched on --- litellm/proxy/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 4c6030e5b..12605cf40 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -188,6 +188,10 @@ class ProxyLogging: user_current_spend: float, user_info=None, ): + if self.alerting is None: + # do nothing if alerting is not switched on + return + if type == "user_and_proxy_budget": user_info = dict(user_info) user_id = user_info["user_id"] From 7e1b9158fe3aec3f2379fbf9e9a01c385f35ae59 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:30:49 -0800 Subject: [PATCH 14/27] (test) embedding models --- litellm/tests/test_embedding.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 630b41d72..18a6447e1 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -57,6 +57,45 @@ def test_openai_embedding(): # test_openai_embedding() +def test_openai_embedding_3(): + try: + litellm.set_verbose = True + response = embedding( + model="text-embedding-3-small", + input=["good morning from litellm", "this is another item"], + metadata={"anything": "good day"}, + ) + litellm_response = dict(response) + litellm_response_keys = set(litellm_response.keys()) + litellm_response_keys.discard("_response_ms") + + print(litellm_response_keys) + print("LiteLLM Response\n") + # print(litellm_response) + + # same request with OpenAI 1.0+ + import openai + + client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + response = client.embeddings.create( + model="text-embedding-3-small", + input=["good morning from litellm", "this is another item"], + ) + + response = dict(response) + openai_response_keys = set(response.keys()) + print(openai_response_keys) + assert ( + litellm_response_keys == openai_response_keys + 
) # ENSURE the Keys in litellm response is exactly what the openai package returns + assert ( + len(litellm_response["data"]) == 2 + ) # expect two embedding responses from litellm_response since input had two + print(openai_response_keys) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + def test_openai_azure_embedding_simple(): try: litellm.set_verbose = True From 4b15ae41f43ed53368b725991f21c76a8cf43588 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:34:15 -0800 Subject: [PATCH 15/27] (feat) add new OpenAI text-embedding-3 --- model_prices_and_context_window.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 7e5f66990..3fe186908 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -143,6 +143,20 @@ "litellm_provider": "openai", "mode": "chat" }, + "text-embedding-3-large": { + "max_tokens": 8191, + "input_cost_per_token": 0.00000013, + "output_cost_per_token": 0.000000, + "litellm_provider": "openai", + "mode": "embedding" + }, + "text-embedding-3-small": { + "max_tokens": 8191, + "input_cost_per_token": 0.00000002, + "output_cost_per_token": 0.000000, + "litellm_provider": "openai", + "mode": "embedding" + }, "text-embedding-ada-002": { "max_tokens": 8191, "input_cost_per_token": 0.0000001, From c10bda3d30c5b2c1ec9dbdccfe2b01dffddacf2b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:36:11 -0800 Subject: [PATCH 16/27] (chore) cleanup testing file --- litellm/tests/test_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 18a6447e1..42ac6f7f9 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -225,7 +225,7 @@ def test_cohere_embedding3(): pytest.fail(f"Error occurred: {e}") -test_cohere_embedding3() +# test_cohere_embedding3() def test_bedrock_embedding_titan(): From 53961d641e96bfd0fb82bddd23f48cab5bc0f9c8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:38:03 -0800 Subject: [PATCH 17/27] (docs) new OpenAI embedding models --- docs/my-website/docs/embedding/supported_embedding.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index 462cc1e70..735aa01c8 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -71,6 +71,8 @@ response = embedding('text-embedding-ada-002', input=["good morning from litellm | Model Name | Function Call | Required OS Variables | |----------------------|---------------------------------------------|--------------------------------------| +| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` | +| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` | | text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` | ## Azure OpenAI Embedding Models From c6a6deaa413ea2929cf1a2589bb0eee025fb17fd Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:41:46 -0800 Subject: [PATCH 18/27] (feat) add gpt-4-0125-preview --- model_prices_and_context_window.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/model_prices_and_context_window.json 
b/model_prices_and_context_window.json index 3fe186908..458ac05a4 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -62,6 +62,15 @@ "litellm_provider": "openai", "mode": "chat" }, + "gpt-4-0125-preview": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00001, + "output_cost_per_token": 0.00003, + "litellm_provider": "openai", + "mode": "chat" + }, "gpt-4-vision-preview": { "max_tokens": 128000, "max_input_tokens": 128000, From e00f46a6e9c14924be833a822113fe2d397f25f7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:42:10 -0800 Subject: [PATCH 19/27] (test) gpt-4-0125-preview --- litellm/tests/test_completion.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index b2c69804c..e24248bee 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -191,6 +191,21 @@ def test_completion_gpt4_turbo(): # test_completion_gpt4_turbo() +def test_completion_gpt4_turbo_0125(): + try: + response = completion( + model="gpt-4-0125-preview", + messages=messages, + max_tokens=10, + ) + print(response) + except openai.RateLimitError: + print("got a rate liimt error") + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + @pytest.mark.skip(reason="this test is flaky") def test_completion_gpt4_vision(): try: From 8ff00ad8d58d135944389a47bcb0d13231bdad6e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:48:56 -0800 Subject: [PATCH 20/27] (docs) new gpt-4-0125-preview --- docs/my-website/docs/providers/openai.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md index 1a515dea3..26f4a7d69 100644 --- a/docs/my-website/docs/providers/openai.md +++ b/docs/my-website/docs/providers/openai.md @@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL | Model Name | Function Call | |-----------------------|-----------------------------------------------------------------| +| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` | | gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` | | gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` | | gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` | From 5e7c43ebf74624662de56b8d80830e927cdfdb32 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:50:51 -0800 Subject: [PATCH 21/27] =?UTF-8?q?bump:=20version=201.19.2=20=E2=86=92=201.?= =?UTF-8?q?19.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f20d92ee..567f08587 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.19.2" +version = "1.19.3" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.19.2" +version = "1.19.3" version_files = [ "pyproject.toml:^version" ] From 014f83c847291008ec5c9a328e73eafb319ed01d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 
15:00:51 -0800 Subject: [PATCH 22/27] fix(main.py): allow vertex ai project and location to be set in completion() call --- litellm/main.py | 12 +++++++++--- litellm/tests/test_amazing_vertex_completion.py | 4 +++- litellm/utils.py | 4 ++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 89750ef46..ae5d675c6 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -1417,9 +1417,15 @@ def completion( return response response = model_response elif custom_llm_provider == "vertex_ai": - vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT") - vertex_ai_location = litellm.vertex_location or get_secret( - "VERTEXAI_LOCATION" + vertex_ai_project = ( + optional_params.pop("vertex_ai_project", None) + or litellm.vertex_project + or get_secret("VERTEXAI_PROJECT") + ) + vertex_ai_location = ( + optional_params.pop("vertex_ai_location", None) + or litellm.vertex_location + or get_secret("VERTEXAI_LOCATION") ) model_response = vertex_ai.completion( diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index 8467e4434..85c1cb933 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -95,7 +95,8 @@ def test_vertex_ai(): + litellm.vertex_code_text_models ) litellm.set_verbose = False - litellm.vertex_project = "reliablekeys" + vertex_ai_project = "reliablekeys" + # litellm.vertex_project = "reliablekeys" test_models = random.sample(test_models, 1) # test_models += litellm.vertex_language_models # always test gemini-pro @@ -117,6 +118,7 @@ def test_vertex_ai(): model=model, messages=[{"role": "user", "content": "hi"}], temperature=0.7, + vertex_ai_project=vertex_ai_project, ) print("\nModel Response", response) print(response) diff --git a/litellm/utils.py b/litellm/utils.py index 2bc1d34e9..63fab74cc 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -3351,6 +3351,10 @@ def get_optional_params( custom_llm_provider != "bedrock" and custom_llm_provider != "sagemaker" ): # allow dynamically setting boto3 init logic continue + elif ( + k.startswith("vertex_") and custom_llm_provider != "vertex_ai" + ): # allow dynamically setting vertex ai init logic + continue passed_params[k] = v default_params = { "functions": None, From 1ae22ea16db10d0e1e0dc8ffe0d32a5c863228eb Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 16:06:01 -0800 Subject: [PATCH 23/27] refactor: trigger new bump --- litellm/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/utils.py b/litellm/utils.py index 63fab74cc..033990896 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -10,6 +10,7 @@ import sys, re, binascii, struct import litellm import dotenv, json, traceback, threading, base64, ast + import subprocess, os import litellm, openai import itertools From 13776b1df75d9f3775af869cc80e42a54487cbf4 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 16:06:12 -0800 Subject: [PATCH 24/27] =?UTF-8?q?bump:=20version=201.19.3=20=E2=86=92=201.?= =?UTF-8?q?19.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 567f08587..82eab7fc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.19.3" +version = "1.19.4" description = "Library to easily interface with LLM API providers" 
authors = ["BerriAI"] license = "MIT" @@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.19.3" +version = "1.19.4" version_files = [ "pyproject.toml:^version" ] From 39aec43b8660001869e60d49b64bff8376c44b61 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 18:15:24 -0800 Subject: [PATCH 25/27] test(main.py): adding more logging --- litellm/main.py | 6 +++++- litellm/tests/test_custom_logger.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index ae5d675c6..f9f1139f6 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx import litellm - +from ._logging import verbose_logger from litellm import ( # type: ignore client, exception_type, @@ -3346,11 +3346,15 @@ def stream_chunk_builder( ): model_response = litellm.ModelResponse() ### SORT CHUNKS BASED ON CREATED ORDER ## + print_verbose("Goes into checking if chunk has hiddden created at param") if chunks[0]._hidden_params.get("created_at", None): + print_verbose("Chunks have a created at hidden param") # Sort chunks based on created_at in ascending order chunks = sorted( chunks, key=lambda x: x._hidden_params.get("created_at", float("inf")) ) + print_verbose("Chunks sorted") + # set hidden params from chunk to model_response if model_response is not None and hasattr(model_response, "_hidden_params"): model_response._hidden_params = chunks[0].get("_hidden_params", {}) diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index e403c3afe..e1c532a88 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -206,7 +206,7 @@ def test_azure_completion_stream(): # checks if the model response available in the async + stream callbacks is equal to the received response customHandler2 = MyCustomHandler() litellm.callbacks = [customHandler2] - litellm.set_verbose = False + litellm.set_verbose = True messages = [ {"role": "system", "content": "You are a helpful assistant."}, { From 554f1a090d50d734490df7a2b89acfbee2c052cd Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 18:31:07 -0800 Subject: [PATCH 26/27] test(test_keys.py): add delay for test check n --- tests/test_keys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_keys.py b/tests/test_keys.py index cb06e1f7e..348be63af 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -376,7 +376,7 @@ async def test_key_with_budgets(): print(f"hashed_token: {hashed_token}") key_info = await get_key_info(session=session, get_key=key, call_key=key) reset_at_init_value = key_info["info"]["budget_reset_at"] - await asyncio.sleep(15) + await asyncio.sleep(30) key_info = await get_key_info(session=session, get_key=key, call_key=key) reset_at_new_value = key_info["info"]["budget_reset_at"] assert reset_at_init_value != reset_at_new_value From e948b39e3ab545b6502e9159d9415d6c55f91a66 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 18:34:13 -0800 Subject: [PATCH 27/27] test(test_streaming.py): fix test to handle none chunk --- litellm/tests/test_streaming.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d9f99bece..fda640c96 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ 
-847,9 +847,13 @@ def test_sagemaker_weird_response(): logging_obj=logging_obj, ) complete_response = "" - for chunk in response: - print(chunk) - complete_response += chunk["choices"][0]["delta"]["content"] + for idx, chunk in enumerate(response): + # print + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + complete_response += chunk + if finished: + break assert len(complete_response) > 0 except Exception as e: pytest.fail(f"An exception occurred - {str(e)}")
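
For reference, a minimal sketch of the sync streaming path these patches exercise: calling a SageMaker model through litellm with stream=True and assembling the streamed chunks. This is illustrative only, not part of the patches; it assumes a deployed SageMaker endpoint with the name used in the tests above (berri-benchmarking-Llama-2-70b-chat-hf-4) and AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_REGION_NAME set in the environment.

# Minimal sketch of SageMaker sync streaming through litellm. The endpoint name
# below is taken from the tests in this series and is an assumption; a deployed
# SageMaker endpoint plus AWS credentials in the environment are required.
import litellm

litellm.set_verbose = True  # surface CustomStreamWrapper / logging debug output

response = litellm.completion(
    model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",  # assumed endpoint name
    messages=[{"role": "user", "content": "Hi - i'm testing sagemaker streaming"}],
    max_tokens=80,
    stream=True,
)

complete_response = ""
for chunk in response:
    # each chunk is a streaming ModelResponse; the final chunk carries finish_reason="stop"
    complete_response += chunk.choices[0].delta.content or ""
    if chunk.choices[0].finish_reason is not None:
        break

print(complete_response)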