diff --git a/litellm/caching.py b/litellm/caching.py
index d73112d21..6f56b5cc3 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -1216,7 +1216,7 @@ class DualCache(BaseCache):
                         self.in_memory_cache.set_cache(key, redis_result[key], **kwargs)
 
                 for key, value in redis_result.items():
-                    result[sublist_keys.index(key)] = value
+                    result[keys.index(key)] = value
 
             print_verbose(f"async batch get cache: cache result: {result}")
             return result
@@ -1266,7 +1266,6 @@ class DualCache(BaseCache):
                     keys, **kwargs
                 )
-                print_verbose(f"in_memory_result: {in_memory_result}")
                 if in_memory_result is not None:
                     result = in_memory_result
 
             if None in result and self.redis_cache is not None and local_only == False:
@@ -1290,9 +1289,9 @@ class DualCache(BaseCache):
                             key, redis_result[key], **kwargs
                         )
                 for key, value in redis_result.items():
-                    result[sublist_keys.index(key)] = value
+                    index = keys.index(key)
+                    result[index] = value
 
-            print_verbose(f"async batch get cache: cache result: {result}")
             return result
         except Exception as e:
             traceback.print_exc()
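The litellm/caching.py change above writes Redis hits back into the result list by their position in the caller's original keys list (keys.index(key)) instead of their position in the Redis-only sublist (sublist_keys.index(key)). A minimal standalone sketch of why that matters, using toy keys and values rather than the real DualCache internals:

    # Illustrative sketch only -- toy data, not the DualCache implementation.
    keys = ["a", "b", "c"]                # the order the caller asked for
    result = ["from-memory", None, None]  # in-memory batch lookup: only "a" hit
    # Only the misses are sent to Redis:
    sublist_keys = [k for k, v in zip(keys, result) if v is None]  # ["b", "c"]
    redis_result = {"c": "from-redis"}    # Redis found one of the two misses

    for key, value in redis_result.items():
        # sublist_keys.index("c") == 1, but "c" belongs at index 2 of `keys`;
        # indexing into the full key list keeps results aligned with the request.
        result[keys.index(key)] = value

    assert result == ["from-memory", None, "from-redis"]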
diff --git a/litellm/integrations/prometheus_services.py b/litellm/integrations/prometheus_services.py
index 4171593ba..90261af07 100644
--- a/litellm/integrations/prometheus_services.py
+++ b/litellm/integrations/prometheus_services.py
@@ -152,7 +152,6 @@ class PrometheusServicesLogger:
         if self.mock_testing:
             self.mock_testing_success_calls += 1
 
-        print(f"LOGS SUCCESSFUL CALL TO PROMETHEUS - payload={payload}")
         if payload.service.value in self.payload_to_prometheus_map:
             prom_objects = self.payload_to_prometheus_map[payload.service.value]
             for obj in prom_objects:
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 0f7c24576..98a59a0af 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -3,8 +3,16 @@ model_list:
   litellm_params:
     model: openai/my-fake-model
     api_key: my-fake-key
-    # api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
-    api_base: http://0.0.0.0:8080
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    # api_base: http://0.0.0.0:8080
+    stream_timeout: 0.001
+    rpm: 10
+- model_name: fake-openai-endpoint
+  litellm_params:
+    model: openai/my-fake-model-2
+    api_key: my-fake-key
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    # api_base: http://0.0.0.0:8080
     stream_timeout: 0.001
     rpm: 10
 - litellm_params:
@@ -33,9 +41,7 @@ litellm_settings:
 
 router_settings:
   routing_strategy: usage-based-routing-v2
-  redis_host: os.environ/REDIS_HOST
-  redis_password: os.environ/REDIS_PASSWORD
-  redis_port: os.environ/REDIS_PORT
+  redis_url: "rediss://:073f655645b843c4839329aea8384e68@us1-great-lizard-40486.upstash.io:40486/0"
   enable_pre_call_checks: True
 
 general_settings:
diff --git a/litellm/router_strategy/lowest_tpm_rpm_v2.py b/litellm/router_strategy/lowest_tpm_rpm_v2.py
index 4f6364c2b..b2b6df42b 100644
--- a/litellm/router_strategy/lowest_tpm_rpm_v2.py
+++ b/litellm/router_strategy/lowest_tpm_rpm_v2.py
@@ -187,6 +187,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
                         request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"),  # type: ignore
                     ),
                 )
 
+            return deployment
         except Exception as e:
             if isinstance(e, litellm.RateLimitError):
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index ae7bb4289..2ee789c6f 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -33,6 +33,51 @@ def generate_random_word(length=4):
 messages = [{"role": "user", "content": "who is ishaan 5222"}]
 
 
+@pytest.mark.asyncio
+async def test_dual_cache_async_batch_get_cache():
+    """
+    Unit testing for Dual Cache async_batch_get_cache()
+    - 2 item query
+    - in_memory result has a partial hit (1/2)
+    - hit redis for the other -> expect to return None
+    - expect result = [in_memory_result, None]
+    """
+    from litellm.caching import DualCache, InMemoryCache, RedisCache
+
+    in_memory_cache = InMemoryCache()
+    redis_cache = RedisCache()  # get credentials from environment
+    dual_cache = DualCache(in_memory_cache=in_memory_cache, redis_cache=redis_cache)
+
+    in_memory_cache.set_cache(key="test_value", value="hello world")
+
+    result = await dual_cache.async_batch_get_cache(keys=["test_value", "test_value_2"])
+
+    assert result[0] == "hello world"
+    assert result[1] == None
+
+
+def test_dual_cache_batch_get_cache():
+    """
+    Unit testing for Dual Cache batch_get_cache()
+    - 2 item query
+    - in_memory result has a partial hit (1/2)
+    - hit redis for the other -> expect to return None
+    - expect result = [in_memory_result, None]
+    """
+    from litellm.caching import DualCache, InMemoryCache, RedisCache
+
+    in_memory_cache = InMemoryCache()
+    redis_cache = RedisCache()  # get credentials from environment
+    dual_cache = DualCache(in_memory_cache=in_memory_cache, redis_cache=redis_cache)
+
+    in_memory_cache.set_cache(key="test_value", value="hello world")
+
+    result = dual_cache.batch_get_cache(keys=["test_value", "test_value_2"])
+
+    assert result[0] == "hello world"
+    assert result[1] == None
+
+
 # @pytest.mark.skip(reason="")
 def test_caching_dynamic_args():  # test in memory cache
     try:
diff --git a/litellm/tests/test_tpm_rpm_routing_v2.py b/litellm/tests/test_tpm_rpm_routing_v2.py
index 4a0256f6a..9a43ae3ca 100644
--- a/litellm/tests/test_tpm_rpm_routing_v2.py
+++ b/litellm/tests/test_tpm_rpm_routing_v2.py
@@ -23,6 +23,10 @@ from litellm.caching import DualCache
 
 ### UNIT TESTS FOR TPM/RPM ROUTING ###
 
+"""
+- Given 2 deployments, make sure it's shuffling deployments correctly.
+"""
+
 
 def test_tpm_rpm_updated():
     test_cache = DualCache()
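The proxy_server_config.yaml change below registers two deployments under the same model group name (fake-openai-endpoint-3), and the new end-to-end test in tests/test_openai_endpoints.py reads the x-litellm-model-id response header on each call to confirm that usage-based-routing-v2 spreads requests across both. A minimal sketch of that header check, with hypothetical deployment ids (only the header name comes from the test; the values are made up for illustration):

    # Illustrative sketch only -- hypothetical ids, mirroring the check in the
    # end-to-end test added below.
    headers_per_call = [
        {"x-litellm-model-id": "deployment-a"},
        {"x-litellm-model-id": "deployment-a"},
        {"x-litellm-model-id": "deployment-b"},  # a different deployment served this call
    ]

    first_id = headers_per_call[0]["x-litellm-model-id"]
    deployments_shuffled = any(
        h["x-litellm-model-id"] != first_id for h in headers_per_call[1:]
    )
    assert deployments_shuffled  # with 2 deployments in the group, calls should spread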
diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml
index f69b7ef1a..dcd5c6855 100644
--- a/proxy_server_config.yaml
+++ b/proxy_server_config.yaml
@@ -55,6 +55,20 @@ model_list:
       api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
       stream_timeout: 0.001
      rpm: 1
+  - model_name: fake-openai-endpoint-3
+    litellm_params:
+      model: openai/my-fake-model
+      api_key: my-fake-key
+      api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+      stream_timeout: 0.001
+      rpm: 10
+  - model_name: fake-openai-endpoint-3
+    litellm_params:
+      model: openai/my-fake-model-2
+      api_key: my-fake-key
+      api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+      stream_timeout: 0.001
+      rpm: 10
   - model_name: "*"
     litellm_params:
       model: openai/*
diff --git a/tests/test_openai_endpoints.py b/tests/test_openai_endpoints.py
index f6bf218ae..465817d83 100644
--- a/tests/test_openai_endpoints.py
+++ b/tests/test_openai_endpoints.py
@@ -102,6 +102,47 @@ async def chat_completion(session, key, model="gpt-4"):
         return await response.json()
 
 
+async def chat_completion_with_headers(session, key, model="gpt-4"):
+    url = "http://0.0.0.0:4000/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {key}",
+        "Content-Type": "application/json",
+    }
+    data = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"},
+        ],
+    }
+
+    async with session.post(url, headers=headers, json=data) as response:
+        status = response.status
+        response_text = await response.text()
+
+        print(response_text)
+        print()
+
+        if status != 200:
+            raise Exception(f"Request did not return a 200 status code: {status}")
+
+        response_header_check(
+            response
+        )  # calling the function to check response headers
+
+        raw_headers = response.raw_headers
+        raw_headers_json = {}
+
+        for (
+            item
+        ) in (
+            response.raw_headers
+        ):  # ((b'date', b'Fri, 19 Apr 2024 21:17:29 GMT'), (), )
+            raw_headers_json[item[0].decode("utf-8")] = item[1].decode("utf-8")
+
+        return raw_headers_json
+
+
 async def completion(session, key):
     url = "http://0.0.0.0:4000/completions"
     headers = {
@@ -222,6 +263,36 @@ async def test_chat_completion_ratelimit():
         pass
 
 
+@pytest.mark.asyncio
+async def test_chat_completion_different_deployments():
+    """
+    - call model group with 2 deployments
+    - make 5 calls
+    - expect 2 unique deployments
+    """
+    async with aiohttp.ClientSession() as session:
+        # key_gen = await generate_key(session=session)
+        key = "sk-1234"
+        results = []
+        for _ in range(5):
+            results.append(
+                await chat_completion_with_headers(
+                    session=session, key=key, model="fake-openai-endpoint-3"
+                )
+            )
+        try:
+            print(f"results: {results}")
+            init_model_id = results[0]["x-litellm-model-id"]
+            deployments_shuffled = False
+            for result in results[1:]:
+                if init_model_id != result["x-litellm-model-id"]:
+                    deployments_shuffled = True
+            if deployments_shuffled == False:
+                pytest.fail("Expected at least 1 shuffled call")
+        except Exception as e:
+            pass
+
+
 @pytest.mark.asyncio
 async def test_chat_completion_old_key():
     """