fix(lowest_tpm_rpm_routing.py): fix parallel rate limit check (#6577)

* fix(lowest_tpm_rpm_routing.py): fix parallel rate limit check

* fix(lowest_tpm_rpm_v2.py): return headers in correct format
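
The fix (visible in the lowest_tpm_rpm_v2.py hunks further down) is casting the retry-after value to a string. A minimal sketch of why, assuming plain httpx: header values must be str/bytes when the synthetic 429 response is built, so an int 60 is rejected.

import httpx

# httpx rejects non-str/bytes header values, hence str(60) in the fix
headers = {"retry-after": str(60)}
request = httpx.Request(method="GET", url="https://github.com/BerriAI/litellm")
response = httpx.Response(status_code=429, headers=headers, request=request)
assert response.headers["retry-after"] == "60"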

* test: update test

* build(deps): bump cookie and express in /docs/my-website (#6566)

Bumps [cookie](https://github.com/jshttp/cookie) and [express](https://github.com/expressjs/express). These dependencies needed to be updated together.

Updates `cookie` from 0.6.0 to 0.7.1
- [Release notes](https://github.com/jshttp/cookie/releases)
- [Commits](https://github.com/jshttp/cookie/compare/v0.6.0...v0.7.1)

Updates `express` from 4.20.0 to 4.21.1
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.1/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.20.0...4.21.1)

---
updated-dependencies:
- dependency-name: cookie
  dependency-type: indirect
- dependency-name: express
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* docs(virtual_keys.md): update Dockerfile reference (#6554)

Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>

* (proxy fix) - call connect on prisma client when running setup (#6534)

* critical fix - call connect on prisma client when running setup
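
A minimal sketch of the idea using prisma-client-py directly (litellm wraps this in its own PrismaClient helper): the client has to be explicitly connected before setup code runs its first query.

from prisma import Prisma  # prisma-client-py

async def setup_db():
    db = Prisma()
    await db.connect()                  # without this, the first query fails with a not-connected error
    try:
        await db.query_raw("SELECT 1")  # setup-time queries only work on a connected client
    finally:
        await db.disconnect()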

* fix test_proxy_server_prisma_setup

* fix test_proxy_server_prisma_setup

* Add 3.5 haiku (#6588)

* feat: add claude-3-5-haiku-20241022 entries

* feat: add claude-3-5-haiku-20241022 and vertex_ai/claude-3-5-haiku@20241022 models

* add missing entries, remove vision

* remove image token costs
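
These entries land in litellm's model cost map; a sketch of the shape of one entry, with placeholder numbers (the real limits and per-token prices are in model_prices_and_context_window.json):

model_cost_entry = {
    "claude-3-5-haiku-20241022": {
        "max_tokens": 0,               # placeholder
        "max_input_tokens": 0,         # placeholder
        "max_output_tokens": 0,        # placeholder
        "input_cost_per_token": 0.0,   # placeholder
        "output_cost_per_token": 0.0,  # placeholder
        "litellm_provider": "anthropic",
        "mode": "chat",
        # per the commits above, vision support and image token costs were removed for this model
    },
}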

* Litellm perf improvements 3 (#6573)

* perf: move writing key to cache, to background task
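
A sketch of the pattern (names are illustrative, not the exact litellm code): instead of awaiting the cache write on the request path, schedule it as a background task and return immediately.

import asyncio

_background_tasks: set = set()   # hold references so scheduled tasks aren't garbage-collected

async def write_key_to_cache(cache, key, value):
    await cache.async_set_cache(key, value)   # network hop (e.g. Redis) kept off the hot path

async def handle_request(cache, key, value):
    # before: `await write_key_to_cache(cache, key, value)` blocked the response on the cache write
    task = asyncio.create_task(write_key_to_cache(cache, key, value))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return {"status": "ok"}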

* perf(litellm_pre_call_utils.py): add otel tracing for pre-call utils

the pre-call utils add ~200ms on calls with pgdb connected; tracing makes that overhead visible
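
A minimal sketch of the span wrapping this adds, using the standard OpenTelemetry tracing API (function name and signature here are illustrative):

from opentelemetry import trace

tracer = trace.get_tracer(__name__)

async def add_litellm_data_to_request(data: dict, request) -> dict:
    with tracer.start_as_current_span("litellm_pre_call_utils"):
        # existing pre-call work (key/team settings, metadata enrichment, etc.) runs inside the span
        return data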

* fix(litellm_pre_call_utils.py): rename call_type to actual call used

* perf(proxy_server.py): remove db logic from _get_config_from_file

was causing db calls to occur on every llm request, if team_id was set on key
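
Condensed view of the split (simplified; placeholders stand in for the real file/S3 loading and the module-level globals in proxy_server.py):

prisma_client = None          # module-level globals in proxy_server.py (placeholders here)
store_model_in_db = False

class ProxyConfig:
    async def _get_config_from_file(self, config_file_path=None) -> dict:
        # pure file / bucket read -- no DB access on this path anymore
        return {}

    async def get_config(self, config_file_path=None) -> dict:
        config = await self._get_config_from_file(config_file_path=config_file_path)
        if prisma_client is not None:
            # the DB merge now runs only when the config is (re)loaded, not on every LLM request
            config = await self._update_config_from_db(
                config=config, prisma_client=prisma_client, store_model_in_db=store_model_in_db
            )
        self.config = config
        return config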

* fix(auth_checks.py): add check for reducing db calls if user/team id does not exist in db

reduces latency/call by ~100ms
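
A sketch of the pattern (simplified, loosely based on litellm's auth_checks helpers): check the in-memory cache first, and when the user/team id isn't in the DB, fail fast instead of issuing further DB lookups on the same request.

async def get_team_object(team_id: str, prisma_client, cache):
    cached = await cache.async_get_cache(key=f"team_id:{team_id}")
    if cached is not None:
        return cached
    team = await prisma_client.db.litellm_teamtable.find_unique(where={"team_id": team_id})
    if team is None:
        # fail fast; don't fall through to more DB calls for an id that doesn't exist
        raise Exception(f"Team doesn't exist in db. team_id={team_id}")
    await cache.async_set_cache(key=f"team_id:{team_id}", value=team)
    return team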

* fix(proxy_server.py): minor fix on existing_settings not including alerting

* fix(exception_mapping_utils.py): map databricks exception string

* fix(auth_checks.py): fix auth check logic

* test: correctly mark flaky test

* fix(utils.py): handle auth token error for tokenizers.from_pretrained
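
A sketch of the defensive pattern (helper name and return shape are illustrative): gated or private Hugging Face repos make tokenizers' from_pretrained raise an auth error, and returning None lets the caller fall back to the default tokenizer instead of failing the request.

from tokenizers import Tokenizer

def _return_huggingface_tokenizer(model: str):
    try:
        return {"type": "huggingface_tokenizer", "tokenizer": Tokenizer.from_pretrained(model)}
    except Exception:
        # e.g. 401/gated-repo errors when no valid HF token is configured;
        # the caller falls back to the default tiktoken-based tokenizer
        return None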

* build: fix map

* build: fix map

* build: fix json for model map

* test: remove eol model

* fix(proxy_server.py): fix db config loading logic

* fix(proxy_server.py): fix order of config / db updates, to ensure fields not overwritten

* test: skip test if required env var is missing

* test: fix test

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: paul-gauthier <69695708+paul-gauthier@users.noreply.github.com>
Krish Dholakia, 2024-11-05 22:03:44 +05:30 (committed by GitHub)
commit 695f48a8f1, parent f3071161ad
7 changed files with 148 additions and 64 deletions


@@ -10,9 +10,20 @@ model_list:
output_cost_per_token: 0.000015 # 15$/M
api_base: "https://exampleopenaiendpoint-production.up.railway.app"
api_key: my-fake-key
- model_name: gemini-1.5-flash-002
- model_name: fake-openai-endpoint-2
litellm_params:
model: gemini/gemini-1.5-flash-002
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
stream_timeout: 0.001
timeout: 1
rpm: 1
- model_name: fake-openai-endpoint
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
# litellm_settings:
# fallbacks: [{ "claude-3-5-sonnet-20240620": ["claude-3-5-sonnet-aihubmix"] }]
@@ -20,47 +31,47 @@ model_list:
# default_redis_batch_cache_expiry: 10
litellm_settings:
cache: True
cache_params:
type: redis
# litellm_settings:
# cache: True
# cache_params:
# type: redis
# disable caching on the actual API call
supported_call_types: []
# # disable caching on the actual API call
# supported_call_types: []
# see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
host: os.environ/REDIS_HOST
port: os.environ/REDIS_PORT
password: os.environ/REDIS_PASSWORD
# # see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
# host: os.environ/REDIS_HOST
# port: os.environ/REDIS_PORT
# password: os.environ/REDIS_PASSWORD
# see https://docs.litellm.ai/docs/proxy/caching#turn-on-batch_redis_requests
# see https://docs.litellm.ai/docs/proxy/prometheus
callbacks: ['otel']
# # see https://docs.litellm.ai/docs/proxy/caching#turn-on-batch_redis_requests
# # see https://docs.litellm.ai/docs/proxy/prometheus
# callbacks: ['otel']
router_settings:
routing_strategy: latency-based-routing
routing_strategy_args:
# only assign 40% of traffic to the fastest deployment to avoid overloading it
lowest_latency_buffer: 0.4
# # router_settings:
# # routing_strategy: latency-based-routing
# # routing_strategy_args:
# # # only assign 40% of traffic to the fastest deployment to avoid overloading it
# # lowest_latency_buffer: 0.4
# consider last five minutes of calls for latency calculation
ttl: 300
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
redis_password: os.environ/REDIS_PASSWORD
# # # consider last five minutes of calls for latency calculation
# # ttl: 300
# # redis_host: os.environ/REDIS_HOST
# # redis_port: os.environ/REDIS_PORT
# # redis_password: os.environ/REDIS_PASSWORD
# see https://docs.litellm.ai/docs/proxy/prod#1-use-this-configyaml
general_settings:
master_key: os.environ/LITELLM_MASTER_KEY
database_url: os.environ/DATABASE_URL
disable_master_key_return: true
# alerting: ['slack', 'email']
alerting: ['email']
# # # see https://docs.litellm.ai/docs/proxy/prod#1-use-this-configyaml
# # general_settings:
# # master_key: os.environ/LITELLM_MASTER_KEY
# # database_url: os.environ/DATABASE_URL
# # disable_master_key_return: true
# # # alerting: ['slack', 'email']
# # alerting: ['email']
# Batch write spend updates every 60s
proxy_batch_write_at: 60
# # # Batch write spend updates every 60s
# # proxy_batch_write_at: 60
# see https://docs.litellm.ai/docs/proxy/caching#advanced---user-api-key-cache-ttl
# our api keys rarely change
user_api_key_cache_ttl: 3600
# # # see https://docs.litellm.ai/docs/proxy/caching#advanced---user-api-key-cache-ttl
# # # our api keys rarely change
# # user_api_key_cache_ttl: 3600


@@ -757,12 +757,6 @@ async def _PROXY_track_cost_callback(
verbose_proxy_logger.debug("INSIDE _PROXY_track_cost_callback")
global prisma_client
try:
# check if it has collected an entire stream response
verbose_proxy_logger.debug(
"Proxy: In track_cost_callback for: kwargs=%s and completion_response: %s",
kwargs,
completion_response,
)
verbose_proxy_logger.debug(
f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}"
)
@@ -1359,7 +1353,7 @@ class ProxyConfig:
"""
def __init__(self) -> None:
pass
self.config: Dict[str, Any] = {}
def is_yaml(self, config_file_path: str) -> bool:
if not os.path.isfile(config_file_path):
@@ -1465,7 +1459,7 @@ class ProxyConfig:
"""
# load existing config
config = await self.get_config()
config = self.config
## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..)
litellm_settings = config.get("litellm_settings", {})
@@ -1518,7 +1512,9 @@ class ProxyConfig:
dict: config
"""
global prisma_client, store_model_in_db
# Load existing config
if os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None:
bucket_name = os.environ.get("LITELLM_CONFIG_BUCKET_NAME")
object_key = os.environ.get("LITELLM_CONFIG_BUCKET_OBJECT_KEY")
@@ -1540,12 +1536,21 @@ class ProxyConfig:
else:
# default to file
config = await self._get_config_from_file(config_file_path=config_file_path)
## UPDATE CONFIG WITH DB
if prisma_client is not None:
config = await self._update_config_from_db(
config=config,
prisma_client=prisma_client,
store_model_in_db=store_model_in_db,
)
## PRINT YAML FOR CONFIRMING IT WORKS
printed_yaml = copy.deepcopy(config)
printed_yaml.pop("environment_variables", None)
config = self._check_for_os_environ_vars(config=config)
self.config = config
return config
async def load_config( # noqa: PLR0915
@@ -2357,6 +2362,55 @@ class ProxyConfig:
pass_through_endpoints=general_settings["pass_through_endpoints"]
)
async def _update_config_from_db(
self,
prisma_client: PrismaClient,
config: dict,
store_model_in_db: Optional[bool],
):
if store_model_in_db is not True:
verbose_proxy_logger.info(
"'store_model_in_db' is not True, skipping db updates"
)
return config
_tasks = []
keys = [
"general_settings",
"router_settings",
"litellm_settings",
"environment_variables",
]
for k in keys:
response = prisma_client.get_generic_data(
key="param_name", value=k, table_name="config"
)
_tasks.append(response)
responses = await asyncio.gather(*_tasks)
for response in responses:
if response is not None:
param_name = getattr(response, "param_name", None)
verbose_proxy_logger.info(f"loading {param_name} settings from db")
if param_name == "litellm_settings":
verbose_proxy_logger.info(
f"litellm_settings: {response.param_value}"
)
param_value = getattr(response, "param_value", None)
if param_name is not None and param_value is not None:
# check if param_name is already in the config
if param_name in config:
if isinstance(config[param_name], dict):
config[param_name].update(param_value)
else:
config[param_name] = param_value
else:
# if it's not in the config - then add it
config[param_name] = param_value
return config
async def add_deployment(
self,
prisma_client: PrismaClient,


@@ -585,6 +585,7 @@ class Router:
def routing_strategy_init(
self, routing_strategy: Union[RoutingStrategy, str], routing_strategy_args: dict
):
verbose_router_logger.info(f"Routing strategy: {routing_strategy}")
if (
routing_strategy == RoutingStrategy.LEAST_BUSY.value
or routing_strategy == RoutingStrategy.LEAST_BUSY
@@ -912,6 +913,7 @@ class Router:
logging_obj=logging_obj,
parent_otel_span=parent_otel_span,
)
response = await _response
## CHECK CONTENT FILTER ERROR ##
@@ -2961,14 +2963,14 @@ class Router:
raise
# decides how long to sleep before retry
_timeout = self._time_to_sleep_before_retry(
retry_after = self._time_to_sleep_before_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
healthy_deployments=_healthy_deployments,
)
# sleeps for the length of the timeout
await asyncio.sleep(_timeout)
await asyncio.sleep(retry_after)
for current_attempt in range(num_retries):
try:
# if the function call is successful, no exception will be raised and we'll break out of the loop
@@ -4178,7 +4180,9 @@ class Router:
model = _model
## GET LITELLM MODEL INFO - raises exception, if model is not mapped
model_info = litellm.get_model_info(model=model)
model_info = litellm.get_model_info(
model="{}/{}".format(custom_llm_provider, model)
)
## CHECK USER SET MODEL INFO
user_model_info = deployment.get("model_info", {})
@@ -4849,7 +4853,7 @@ class Router:
)
continue
except Exception as e:
verbose_router_logger.error("An error occurs - {}".format(str(e)))
verbose_router_logger.exception("An error occurs - {}".format(str(e)))
_litellm_params = deployment.get("litellm_params", {})
model_id = deployment.get("model_info", {}).get("id", "")


@@ -180,7 +180,6 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
deployment_rpm = deployment.get("model_info", {}).get("rpm")
if deployment_rpm is None:
deployment_rpm = float("inf")
if local_result is not None and local_result >= deployment_rpm:
raise litellm.RateLimitError(
message="Deployment over defined rpm limit={}. current usage={}".format(
@@ -195,7 +194,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
deployment_rpm,
local_result,
),
headers={"retry-after": 60}, # type: ignore
headers={"retry-after": str(60)}, # type: ignore
request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
@@ -221,7 +220,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
deployment_rpm,
result,
),
headers={"retry-after": 60}, # type: ignore
headers={"retry-after": str(60)}, # type: ignore
request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)


@@ -137,7 +137,7 @@ async def _handle_router_calls(router):
Nam vitae finibus eros, eu eleifend erat. Maecenas hendrerit magna quis molestie dictum. Ut consequat quam eu massa auctor pulvinar. Pellentesque vitae eros ornare urna accumsan tempor. Maecenas porta id quam at sodales. Donec quis accumsan leo, vel viverra nibh. Vestibulum congue blandit nulla, sed rhoncus libero eleifend ac. In risus lorem, rutrum et tincidunt a, interdum a lectus. Pellentesque aliquet pulvinar mauris, ut ultrices nibh ultricies nec. Mauris mi mauris, facilisis nec metus non, egestas luctus ligula. Quisque ac ligula at felis mollis blandit id nec risus. Nam sollicitudin lacus sed sapien fringilla ullamcorper. Etiam dui quam, posuere sit amet velit id, aliquet molestie ante. Integer cursus eget sapien fringilla elementum. Integer molestie, mi ac scelerisque ultrices, nunc purus condimentum est, in posuere quam nibh vitae velit.
"""
completion = await router.acompletion(
"gpt-4o-2024-08-06",
"gpt-3.5-turbo",
[
{
"role": "user",
@@ -166,16 +166,17 @@ async def test_max_parallel_requests_rpm_rate_limiting():
enable_pre_call_checks=True,
model_list=[
{
"model_name": "gpt-4o-2024-08-06",
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-4o-2024-08-06",
"model": "gpt-3.5-turbo",
"temperature": 0.0,
"rpm": 5,
"rpm": 1,
"num_retries": 3,
},
}
],
)
await asyncio.gather(*[_handle_router_calls(router) for _ in range(16)])
await asyncio.gather(*[_handle_router_calls(router) for _ in range(3)])
@pytest.mark.asyncio


@@ -5,7 +5,7 @@ import asyncio
import aiohttp, openai
from openai import OpenAI, AsyncOpenAI
from typing import Optional, List, Union
import uuid
LITELLM_MASTER_KEY = "sk-1234"
@@ -107,7 +107,7 @@ async def chat_completion(session, key, model: Union[str, List] = "gpt-4"):
"model": model,
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"},
{"role": "user", "content": f"Hello! {uuid.uuid4()}"},
],
}
@@ -296,7 +296,6 @@ async def test_chat_completion():
await chat_completion(session=session, key=key_2)
# @pytest.mark.skip(reason="Local test. Proxy not concurrency safe yet. WIP.")
@pytest.mark.asyncio
async def test_chat_completion_ratelimit():
"""


@@ -110,6 +110,7 @@ async def test_team_logging():
pytest.fail(f"Unexpected error: {str(e)}")
@pytest.mark.skip(reason="todo fix langfuse credential error")
@pytest.mark.asyncio
async def test_team_2logging():
"""
@@ -118,6 +119,20 @@ async def test_team_2logging():
-> Make chat/completions call
-> Fetch logs from langfuse
"""
langfuse_public_key = os.getenv("LANGFUSE_PROJECT2_PUBLIC")
print(f"langfuse_public_key: {langfuse_public_key}")
langfuse_secret_key = os.getenv("LANGFUSE_PROJECT2_SECRET")
print(f"langfuse_secret_key: {langfuse_secret_key}")
langfuse_host = "https://us.cloud.langfuse.com"
try:
assert langfuse_public_key is not None
assert langfuse_secret_key is not None
except Exception as e:
# skip test if langfuse credentials are not set
return
try:
async with aiohttp.ClientSession() as session:
@@ -143,8 +158,9 @@
import langfuse
langfuse_client = langfuse.Langfuse(
public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
secret_key=os.getenv("LANGFUSE_PROJECT2_SECRET"),
public_key=langfuse_public_key,
secret_key=langfuse_secret_key,
host=langfuse_host,
)
await asyncio.sleep(10)