feat(lowest_latency.py): route by time to first token, for streaming requests (if available)

Closes https://github.com/BerriAI/litellm/issues/3574
Krrish Dholakia 2024-05-21 13:08:17 -07:00
parent 620e6db027
commit 2b3da449c8
3 changed files with 232 additions and 18 deletions
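
The routing change itself lands in lowest_latency.py: for streaming requests, the lowest-latency strategy now prefers the deployment with the smallest recorded time to first token (TTFT) when that measurement is available, and falls back to total request latency otherwise. Below is a minimal sketch of that idea in Python; the pick_deployment helper and the latency_stats shape are hypothetical illustrations, not the actual litellm implementation.

# Hypothetical sketch of TTFT-aware lowest-latency routing (not litellm's actual code).
from typing import Dict, List, Optional


def pick_deployment(
    deployments: List[str],
    latency_stats: Dict[str, Dict[str, Optional[float]]],
    is_streaming: bool,
) -> str:
    """Return the deployment with the lowest latency score.

    latency_stats maps a deployment id to averaged measurements in seconds,
    e.g. {"ttft": 0.42, "total": 1.8}. Streaming requests are scored by
    time to first token when it has been recorded; everything else falls
    back to total request latency.
    """

    def score(deployment: str) -> float:
        stats = latency_stats.get(deployment, {})
        ttft = stats.get("ttft")
        if is_streaming and ttft is not None:
            return ttft  # route by time to first token
        total = stats.get("total")
        return total if total is not None else float("inf")  # unknown deployments go last

    return min(deployments, key=score)


# Example with made-up numbers: the first deployment streams its first token sooner,
# so it wins for streaming traffic even though its total latency is higher.
stats = {
    "deployment-a": {"ttft": 0.35, "total": 2.1},
    "deployment-b": {"ttft": 0.80, "total": 1.5},
}
print(pick_deployment(["deployment-a", "deployment-b"], stats, is_streaming=True))   # deployment-a
print(pick_deployment(["deployment-a", "deployment-b"], stats, is_streaming=False))  # deployment-b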


@@ -38,19 +38,17 @@ class CompletionCustomHandler(
     # Class variables or attributes
     def __init__(self):
         self.errors = []
-        self.states: Optional[
-            List[
-                Literal[
-                    "sync_pre_api_call",
-                    "async_pre_api_call",
-                    "post_api_call",
-                    "sync_stream",
-                    "async_stream",
-                    "sync_success",
-                    "async_success",
-                    "sync_failure",
-                    "async_failure",
-                ]
-            ]
+        self.states: List[
+            Literal[
+                "sync_pre_api_call",
+                "async_pre_api_call",
+                "post_api_call",
+                "sync_stream",
+                "async_stream",
+                "sync_success",
+                "async_success",
+                "sync_failure",
+                "async_failure",
+            ]
         ] = []
@@ -269,6 +267,7 @@ class CompletionCustomHandler(
             assert isinstance(kwargs["litellm_params"]["api_base"], str)
             assert isinstance(kwargs["start_time"], (datetime, type(None)))
             assert isinstance(kwargs["stream"], bool)
+            assert isinstance(kwargs["completion_start_time"], datetime)
             assert kwargs["cache_hit"] is None or isinstance(kwargs["cache_hit"], bool)
             assert isinstance(kwargs["user"], (str, type(None)))
             assert isinstance(kwargs["input"], (list, dict, str))