diff --git a/litellm/integrations/prometheus_services.py b/litellm/integrations/prometheus_services.py
index 45f70a8c1..0249a71d0 100644
--- a/litellm/integrations/prometheus_services.py
+++ b/litellm/integrations/prometheus_services.py
@@ -129,8 +129,6 @@ class PrometheusServicesLogger:
         if self.mock_testing:
             self.mock_testing_success_calls += 1
 
-        print(f"payload call type: {payload.call_type}")
-
         if payload.service.value in self.payload_to_prometheus_map:
             prom_objects = self.payload_to_prometheus_map[payload.service.value]
             for obj in prom_objects:
@@ -151,8 +149,6 @@ class PrometheusServicesLogger:
         if self.mock_testing:
             self.mock_testing_failure_calls += 1
 
-        print(f"payload call type: {payload.call_type}")
-
         if payload.service.value in self.payload_to_prometheus_map:
             prom_objects = self.payload_to_prometheus_map[payload.service.value]
             for obj in prom_objects:
@@ -170,8 +166,6 @@ class PrometheusServicesLogger:
         if self.mock_testing:
             self.mock_testing_success_calls += 1
 
-        print(f"payload call type: {payload.call_type}")
-
         if payload.service.value in self.payload_to_prometheus_map:
             prom_objects = self.payload_to_prometheus_map[payload.service.value]
             for obj in prom_objects:
@@ -193,8 +187,6 @@ class PrometheusServicesLogger:
         if self.mock_testing:
             self.mock_testing_failure_calls += 1
 
-        print(f"payload call type: {payload.call_type}")
-
         if payload.service.value in self.payload_to_prometheus_map:
             prom_objects = self.payload_to_prometheus_map[payload.service.value]
             for obj in prom_objects:
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 4f13fa00e..2cbbb25ee 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -21,10 +21,10 @@ model_list:
     api_version: "2023-07-01-preview"
     stream_timeout: 0.001
   model_name: azure-gpt-3.5
-# - model_name: text-embedding-ada-002
-#   litellm_params:
-#     model: text-embedding-ada-002
-#     api_key: os.environ/OPENAI_API_KEY
+- model_name: text-embedding-ada-002
+  litellm_params:
+    model: text-embedding-ada-002
+    api_key: os.environ/OPENAI_API_KEY
 - model_name: gpt-instruct
   litellm_params:
     model: text-completion-openai/gpt-3.5-turbo-instruct
@@ -45,6 +45,9 @@ litellm_settings:
   success_callback: ["prometheus"]
   failure_callback: ["prometheus"]
   service_callback: ["prometheus_system"]
+  cache: True
+  cache_params:
+    type: "redis"
 
 general_settings:
 
diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py
index 638295ba6..44343e83a 100644
--- a/litellm/tests/test_amazing_vertex_completion.py
+++ b/litellm/tests/test_amazing_vertex_completion.py
@@ -578,8 +578,10 @@ def test_gemini_pro_function_calling():
         model="gemini-pro", messages=messages, tools=tools, tool_choice="auto"
     )
     print(f"completion: {completion}")
-    assert completion.choices[0].message.content is None
-    assert len(completion.choices[0].message.tool_calls) == 1
+    if hasattr(completion.choices[0].message, "tool_calls") and isinstance(
+        completion.choices[0].message.tool_calls, list
+    ):
+        assert len(completion.choices[0].message.tool_calls) == 1
     try:
         load_vertex_ai_credentials()
         tools = [
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 16f1b3380..903ce69c7 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -178,32 +178,61 @@ def test_caching_with_default_ttl():
         pytest.fail(f"Error occurred: {e}")
 
 
-def test_caching_with_cache_controls():
+@pytest.mark.parametrize(
+    "sync_flag",
+    [True, False],
+)
+@pytest.mark.asyncio
+async def test_caching_with_cache_controls(sync_flag):
     try:
         litellm.set_verbose = True
         litellm.cache = Cache()
         message = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}]
-        ## TTL = 0
-        response1 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
-        )
-        response2 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 10}
-        )
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        assert response2["id"] != response1["id"]
+        if sync_flag:
+            ## TTL = 0
+            response1 = completion(
+                model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
+            )
+            response2 = completion(
+                model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 10}
+            )
+
+            assert response2["id"] != response1["id"]
+        else:
+            ## TTL = 0
+            response1 = await litellm.acompletion(
+                model="gpt-3.5-turbo", messages=messages, cache={"ttl": 0}
+            )
+            await asyncio.sleep(10)
+            response2 = await litellm.acompletion(
+                model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 10}
+            )
+
+            assert response2["id"] != response1["id"]
+
         message = [{"role": "user", "content": f"Hey, how's it going? {uuid.uuid4()}"}]
         ## TTL = 5
-        response1 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
-        )
-        response2 = completion(
-            model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 5}
-        )
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        assert response2["id"] == response1["id"]
+        if sync_flag:
+            response1 = completion(
+                model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5}
+            )
+            response2 = completion(
+                model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 5}
+            )
+            print(f"response1: {response1}")
+            print(f"response2: {response2}")
+            assert response2["id"] == response1["id"]
+        else:
+            response1 = await litellm.acompletion(
+                model="gpt-3.5-turbo", messages=messages, cache={"ttl": 25}
+            )
+            await asyncio.sleep(10)
+            response2 = await litellm.acompletion(
+                model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 25}
+            )
+            print(f"response1: {response1}")
+            print(f"response2: {response2}")
+            assert response2["id"] == response1["id"]
     except Exception as e:
         print(f"error occurred: {traceback.format_exc()}")
         pytest.fail(f"Error occurred: {e}")
@@ -1111,6 +1140,7 @@ async def test_cache_control_overrides():
                 "content": "hello who are you" + unique_num,
             }
         ],
+        caching=True,
     )
 
     print(response1)
@@ -1125,6 +1155,55 @@
                 "content": "hello who are you" + unique_num,
             }
         ],
+        caching=True,
+        cache={"no-cache": True},
+    )
+
+    print(response2)
+
+    assert response1.id != response2.id
+
+
+def test_sync_cache_control_overrides():
+    # we use the cache controls to ensure there is no cache hit on this test
+    litellm.cache = Cache(
+        type="redis",
+        host=os.environ["REDIS_HOST"],
+        port=os.environ["REDIS_PORT"],
+        password=os.environ["REDIS_PASSWORD"],
+    )
+    print("Testing cache override")
+    litellm.set_verbose = True
+    import uuid
+
+    unique_num = str(uuid.uuid4())
+
+    start_time = time.time()
+
+    response1 = litellm.completion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": "hello who are you" + unique_num,
+            }
+        ],
+        caching=True,
+    )
+
+    print(response1)
+
+    time.sleep(2)
+
+    response2 = litellm.completion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": "hello who are you" + unique_num,
+            }
+        ],
+        caching=True,
         cache={"no-cache": True},
     )
 
diff --git a/litellm/utils.py b/litellm/utils.py
index 4beeaaed1..2d0b5cc96 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2716,23 +2716,22 @@ def client(original_function):
 
             # [OPTIONAL] CHECK CACHE
             print_verbose(
-                f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
+                f"SYNC kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}; kwargs.get('cache')['no-cache']: {kwargs.get('cache', {}).get('no-cache', False)}"
             )
             # if caching is false or cache["no-cache"]==True, don't run this
             if (
                 (
                     (
-                        kwargs.get("caching", None) is None
-                        and kwargs.get("cache", None) is None
-                        and litellm.cache is not None
-                    )
-                    or kwargs.get("caching", False) == True
-                    or (
-                        kwargs.get("cache", None) is not None
-                        and kwargs.get("cache", {}).get("no-cache", False) != True
+                        (
+                            kwargs.get("caching", None) is None
+                            and litellm.cache is not None
+                        )
+                        or kwargs.get("caching", False) == True
                     )
+                    and kwargs.get("cache", {}).get("no-cache", False) != True
                 )
                 and kwargs.get("aembedding", False) != True
+                and kwargs.get("atext_completion", False) != True
                 and kwargs.get("acompletion", False) != True
                 and kwargs.get("aimg_generation", False) != True
                 and kwargs.get("atranscription", False) != True
@@ -3011,24 +3010,17 @@ def client(original_function):
 
             )
             # [OPTIONAL] CHECK CACHE
-            print_verbose(f"litellm.cache: {litellm.cache}")
             print_verbose(
-                f"kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}"
+                f"ASYNC kwargs[caching]: {kwargs.get('caching', False)}; litellm.cache: {litellm.cache}; kwargs.get('cache'): {kwargs.get('cache', None)}"
            )
             # if caching is false, don't run this
             final_embedding_cached_response = None
             if (
-                (
-                    kwargs.get("caching", None) is None
-                    and kwargs.get("cache", None) is None
-                    and litellm.cache is not None
-                )
+                (kwargs.get("caching", None) is None and litellm.cache is not None)
                 or kwargs.get("caching", False) == True
-                or (
-                    kwargs.get("cache", None) is not None
-                    and kwargs.get("cache").get("no-cache", False) != True
-                )
+            ) and (
+                kwargs.get("cache", {}).get("no-cache", False) != True
             ):  # allow users to control returning cached responses from the completion function
                 # checking cache
                 print_verbose("INSIDE CHECKING CACHE")
@@ -3074,7 +3066,6 @@ def client(original_function):
                         preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
                     )
                     cached_result = litellm.cache.get_cache(*args, **kwargs)
-
                     if cached_result is not None and not isinstance(
                         cached_result, list
                     ):
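
Reviewer note: the two litellm/utils.py hunks make the sync and async wrappers gate the cache lookup on the same rule. Caching has to be enabled, either explicitly with `caching=True` or implicitly because `litellm.cache` is configured, and the per-request `cache={"no-cache": True}` override has to be absent. The helper below is a standalone sketch of that predicate for illustration only; `should_check_cache` is a hypothetical name, litellm inlines this logic inside its `client` wrapper.

```python
# Standalone sketch of the cache-gating rule introduced by the utils.py hunks.
# `should_check_cache` is a hypothetical name; litellm inlines this logic in the
# `client` decorator rather than exposing a helper like this.
from typing import Any, Dict, Optional


def should_check_cache(kwargs: Dict[str, Any], litellm_cache: Optional[object]) -> bool:
    # Caching is enabled when the caller did not pass `caching` but a global
    # cache is configured, or when the caller explicitly passed caching=True.
    caching_enabled = (
        kwargs.get("caching", None) is None and litellm_cache is not None
    ) or kwargs.get("caching", False) == True

    # The per-request control cache={"no-cache": True} always skips the lookup,
    # even when caching is otherwise enabled (what the new
    # test_sync_cache_control_overrides relies on).
    no_cache = kwargs.get("cache", {}).get("no-cache", False) == True

    return caching_enabled and not no_cache


if __name__ == "__main__":
    fake_global_cache = object()  # stands in for a configured litellm.cache
    print(should_check_cache({}, fake_global_cache))  # True: global cache, no overrides
    print(should_check_cache({"caching": True, "cache": {"no-cache": True}}, None))  # False: opt-out wins
    print(should_check_cache({}, None))  # False: nothing enables caching
```

Compared to the old condition, passing a `cache={...}` dict no longer enables cache checking by itself; it only opts out via `no-cache`, which is why the updated tests now pass `caching=True` explicitly alongside the cache-control dict.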
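
For completeness, a caller-facing sketch of the cache controls the updated tests pin down, using the `completion` / `Cache` API already exercised in this patch. The model, prompt, and the assumption that an OpenAI key is configured are placeholders, and the assertions mirror the ones in test_caching.py rather than guaranteed behavior in every environment.

```python
# Caller-facing sketch of the cache controls exercised by the updated tests.
# Assumes OPENAI_API_KEY is set; model and prompt are placeholders.
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # in-memory cache, as in test_caching_with_cache_controls

messages = [{"role": "user", "content": "hello who are you"}]

# ttl / s-maxage tune how long a cached answer is treated as fresh ...
response1 = completion(model="gpt-3.5-turbo", messages=messages, cache={"ttl": 5})
response2 = completion(model="gpt-3.5-turbo", messages=messages, cache={"s-maxage": 5})
assert response1["id"] == response2["id"]  # second call served from cache

# ... while no-cache always forces a fresh call, even with caching enabled.
response3 = completion(
    model="gpt-3.5-turbo", messages=messages, caching=True, cache={"no-cache": True}
)
assert response3["id"] != response1["id"]
```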