mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-27 03:34:10 +00:00
docs(scheduler.md): add request prioritization to docs
This commit is contained in:
parent 79287a7584
commit f8d4be710e
6 changed files with 205 additions and 53 deletions
@@ -9,11 +9,11 @@ sys.path.insert(
     0, os.path.abspath("../..")
 ) # Adds the parent directory to the system path
 from litellm import Router
-from litellm.proxy.queue.scheduler import FlowItem, Scheduler
+from litellm.scheduler import FlowItem, Scheduler


 @pytest.mark.asyncio
-async def test_scheduler_diff_model_groups():
+async def test_scheduler_diff_model_names():
     """
     Assert 2 requests to 2 diff model groups are top of their respective queue's
     """
@@ -33,13 +33,13 @@ async def test_scheduler_diff_model_groups():

     scheduler.update_variables(llm_router=router)

-    item1 = FlowItem(priority=0, request_id="10", model_group="gpt-3.5-turbo")
-    item2 = FlowItem(priority=0, request_id="11", model_group="gpt-4")
+    item1 = FlowItem(priority=0, request_id="10", model_name="gpt-3.5-turbo")
+    item2 = FlowItem(priority=0, request_id="11", model_name="gpt-4")
     await scheduler.add_request(item1)
     await scheduler.add_request(item2)

-    assert await scheduler.poll(id="10", model_group="gpt-3.5-turbo") == True
-    assert await scheduler.poll(id="11", model_group="gpt-4") == True
+    assert await scheduler.poll(id="10", model_name="gpt-3.5-turbo") == True
+    assert await scheduler.poll(id="11", model_name="gpt-4") == True


 @pytest.mark.parametrize("p0, p1", [(0, 0), (0, 1), (1, 0)])
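Taken with the import move above, the renamed interface can be driven outside pytest roughly as follows. A minimal sketch, assuming Scheduler() takes no constructor arguments (its construction is not shown in this diff); the mock_response deployments keep everything offline:

import asyncio

from litellm import Router
from litellm.scheduler import FlowItem, Scheduler


async def main():
    scheduler = Scheduler()  # assumption: default construction, wired up via update_variables()
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {"model": "gpt-3.5-turbo", "mock_response": "Hey!"},
            },
            {
                "model_name": "gpt-4",
                "litellm_params": {"model": "gpt-4", "mock_response": "Hey!"},
            },
        ]
    )
    scheduler.update_variables(llm_router=router)

    # Each model_name gets its own queue, so both requests sit at the head of theirs.
    await scheduler.add_request(FlowItem(priority=0, request_id="10", model_name="gpt-3.5-turbo"))
    await scheduler.add_request(FlowItem(priority=0, request_id="11", model_name="gpt-4"))
    assert await scheduler.poll(id="10", model_name="gpt-3.5-turbo")
    assert await scheduler.poll(id="11", model_name="gpt-4")


asyncio.run(main())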
@@ -64,17 +64,17 @@ async def test_scheduler_prioritized_requests(p0, p1):

     scheduler.update_variables(llm_router=router)

-    item1 = FlowItem(priority=p0, request_id="10", model_group="gpt-3.5-turbo")
-    item2 = FlowItem(priority=p1, request_id="11", model_group="gpt-3.5-turbo")
+    item1 = FlowItem(priority=p0, request_id="10", model_name="gpt-3.5-turbo")
+    item2 = FlowItem(priority=p1, request_id="11", model_name="gpt-3.5-turbo")
     await scheduler.add_request(item1)
     await scheduler.add_request(item2)

     if p0 == 0:
-        assert await scheduler.peek(id="10", model_group="gpt-3.5-turbo") == True
-        assert await scheduler.peek(id="11", model_group="gpt-3.5-turbo") == False
+        assert await scheduler.peek(id="10", model_name="gpt-3.5-turbo") == True
+        assert await scheduler.peek(id="11", model_name="gpt-3.5-turbo") == False
     else:
-        assert await scheduler.peek(id="11", model_group="gpt-3.5-turbo") == True
-        assert await scheduler.peek(id="10", model_group="gpt-3.5-turbo") == False
+        assert await scheduler.peek(id="11", model_name="gpt-3.5-turbo") == True
+        assert await scheduler.peek(id="10", model_name="gpt-3.5-turbo") == False


 @pytest.mark.parametrize("p0, p1", [(0, 0), (0, 1), (1, 0)])
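The parametrization encodes the ordering rule: a lower number means higher priority, and peek() reports whether a request is at the head of its queue — without dequeuing it, judging by the back-to-back True/False asserts. A sketch of the p0=0, p1=1 case, reusing the hypothetical scheduler/router setup from the previous snippet (runs inside an async function):

item1 = FlowItem(priority=0, request_id="10", model_name="gpt-3.5-turbo")  # higher priority
item2 = FlowItem(priority=1, request_id="11", model_name="gpt-3.5-turbo")  # lower priority
await scheduler.add_request(item1)
await scheduler.add_request(item2)

# Same queue ("gpt-3.5-turbo"), so only the priority-0 item is at the head.
assert await scheduler.peek(id="10", model_name="gpt-3.5-turbo") == True
assert await scheduler.peek(id="11", model_name="gpt-3.5-turbo") == False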
@@ -92,10 +92,12 @@ async def test_scheduler_prioritized_requests_mock_response(p0, p1):
                 "litellm_params": {
                     "model": "gpt-3.5-turbo",
                     "mock_response": "Hello world this is Macintosh!",
+                    "rpm": 1,
                 },
             },
         ],
+        timeout=2,
         routing_strategy="usage-based-routing-v2",
     )

     scheduler.update_variables(llm_router=router)
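These two additions appear to work as a pair: with mock_response the completion itself returns instantly, so "rpm": 1 is what forces the second concurrent request to be rejected (one request per minute on the deployment), while timeout=2 keeps the test from waiting long on the request that cannot go through — setting up exactly the error path that the try/except introduced below is there to catch.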
@@ -114,7 +116,7 @@ async def test_scheduler_prioritized_requests_mock_response(p0, p1):

         while curr_time < end_time:
             make_request = await scheduler.poll(
-                id=flow_item.request_id, model_group=flow_item.model_group
+                id=flow_item.request_id, model_name=flow_item.model_name
             )
             if make_request: ## IF TRUE -> MAKE REQUEST
                 break
@@ -123,10 +125,13 @@ async def test_scheduler_prioritized_requests_mock_response(p0, p1):
             curr_time = time.time()

         if make_request:
-            _response = await router.acompletion(
-                model=flow_item.model_group,
-                messages=[{"role": "user", "content": "Hey!"}],
-            )
+            try:
+                _response = await router.acompletion(
+                    model=flow_item.model_name,
+                    messages=[{"role": "user", "content": "Hey!"}],
+                )
+            except Exception as e:
+                return flow_item.priority, flow_item.request_id, "Error occurred"

             return flow_item.priority, flow_item.request_id, time.time()

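The previous two hunks implement a common pattern: poll until the scheduler says go (or a deadline passes), then make the call, converting any failure — here, the rpm-limit rejection — into an error marker rather than a raised exception. A standalone sketch of that loop; the helper name run_when_scheduled and the not-scheduled fallback are illustrative, not taken from the diff:

import asyncio
import time


async def run_when_scheduled(scheduler, router, flow_item,
                             deadline_s: float = 2.0, poll_interval_s: float = 0.03):
    # Hypothetical helper mirroring the diff's _make_prioritized_call.
    end_time = time.time() + deadline_s
    make_request = False
    while time.time() < end_time:
        make_request = await scheduler.poll(
            id=flow_item.request_id, model_name=flow_item.model_name
        )
        if make_request:  # our turn -> stop polling and fire the request
            break
        await asyncio.sleep(poll_interval_s)  # not our turn yet; back off briefly

    if not make_request:  # never got scheduled before the deadline
        return flow_item.priority, flow_item.request_id, "Error occurred"
    try:
        await router.acompletion(
            model=flow_item.model_name,
            messages=[{"role": "user", "content": "Hey!"}],
        )
    except Exception:  # e.g. rate-limited by rpm=1
        return flow_item.priority, flow_item.request_id, "Error occurred"
    return flow_item.priority, flow_item.request_id, time.time()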
@@ -135,13 +140,13 @@ async def test_scheduler_prioritized_requests_mock_response(p0, p1):
     tasks = []

     item = FlowItem(
-        priority=p0, request_id=str(uuid.uuid4()), model_group="gpt-3.5-turbo"
+        priority=p0, request_id=str(uuid.uuid4()), model_name="gpt-3.5-turbo"
     )
     await scheduler.add_request(request=item)
     tasks.append(_make_prioritized_call(flow_item=item))

     item = FlowItem(
-        priority=p1, request_id=str(uuid.uuid4()), model_group="gpt-3.5-turbo"
+        priority=p1, request_id=str(uuid.uuid4()), model_name="gpt-3.5-turbo"
     )
     await scheduler.add_request(request=item)
     tasks.append(_make_prioritized_call(flow_item=item))
@@ -157,8 +162,4 @@ async def test_scheduler_prioritized_requests_mock_response(p0, p1):
     assert (
         completed_responses[0][0] == 0
     ) # assert higher priority request got done first
-    assert (
-        completed_responses[0][2] < completed_responses[1][2]
-    ), "1st response time={}, 2nd response time={}".format(
-        completed_responses[0][1], completed_responses[1][1]
-    ) # assert higher priority request got done first
+    assert isinstance(completed_responses[1][2], str) # 2nd request errored out
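Each entry in completed_responses is the (priority, request_id, result) tuple returned by _make_prioritized_call, where result is a completion timestamp on success or the string "Error occurred" on failure. The rewritten assertions therefore check that the priority-0 request got through first and that the other request — presumably starved out by the rpm limit — came back as an error string rather than a timestamp.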