feat(router.py): adding latency-based routing strategy
parent 2f3e13e43b
commit 76f46902ed
2 changed files with 156 additions and 90 deletions
@@ -46,11 +46,14 @@ class Router:
                  num_retries: int = 0,
                  timeout: float = 600,
                  default_litellm_params = {}, # default params for Router.chat.completion.create
-                 routing_strategy: Literal["simple-shuffle", "least-busy", "usage-based-routing"] = "simple-shuffle") -> None:
+                 routing_strategy: Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"] = "simple-shuffle") -> None:
 
         if model_list:
             self.set_model_list(model_list)
             self.healthy_deployments: List = self.model_list
+            self.deployment_latency_map = {}
+            for m in model_list:
+                self.deployment_latency_map[m["litellm_params"]["model"]] = 0
 
         self.num_retries = num_retries
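For reference, a minimal usage sketch of the new option (the deployments and env vars below are illustrative, mirroring the test file further down, and are not part of this hunk): every deployment's litellm_params["model"] starts out in deployment_latency_map with a latency of 0.

    # Illustrative sketch only -- model names and keys are placeholders.
    router = Router(
        model_list=[
            {"model_name": "gpt-3.5-turbo",
             "litellm_params": {"model": "azure/chatgpt-v-2",
                                "api_key": os.getenv("AZURE_API_KEY"),
                                "api_version": os.getenv("AZURE_API_VERSION"),
                                "api_base": os.getenv("AZURE_API_BASE")}},
            {"model_name": "gpt-3.5-turbo",
             "litellm_params": {"model": "gpt-3.5-turbo",
                                "api_key": os.getenv("OPENAI_API_KEY")}},
        ],
        routing_strategy="latency-based-routing",
    )
    # right after __init__: {"azure/chatgpt-v-2": 0, "gpt-3.5-turbo": 0}
    print(router.deployment_latency_map)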
@@ -122,6 +125,22 @@ class Router:
            except Exception as e:
                pass
        return healthy_deployments
 
+    def weighted_shuffle_by_latency(self, items):
+        # Sort the items by latency
+        sorted_items = sorted(items, key=lambda x: x[1])
+        # Get only the latencies
+        latencies = [i[1] for i in sorted_items]
+        # Calculate the sum of all latencies
+        total_latency = sum(latencies)
+        # Calculate the weight for each latency (lower latency = higher weight)
+        weights = [total_latency-latency for latency in latencies]
+        # Get a weighted random item
+        if sum(weights) == 0:
+            chosen_item = random.choice(sorted_items)[0]
+        else:
+            chosen_item = random.choices(sorted_items, weights=weights, k=1)[0][0]
+        return chosen_item
+
     def set_model_list(self, model_list: list):
        self.model_list = model_list
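To make the weighting concrete, a small worked sketch (the latencies are made up): with recorded latencies of 1.0s, 2.0s and 3.0s, total_latency is 6.0, so the weights come out as 5.0, 4.0 and 3.0 and the fastest deployment is chosen most often; when every recorded latency is still 0 (e.g. right after startup), all weights are 0 and the method falls back to a plain random.choice.

    # Illustrative sketch only -- "A"/"B"/"C" stand in for deployment dicts.
    items = [("A", 1.0), ("B", 2.0), ("C", 3.0)]
    # total_latency = 6.0 -> weights = [5.0, 4.0, 3.0], so "A" wins ~5/12 of the time
    chosen = router.weighted_shuffle_by_latency(items)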
@@ -155,6 +174,21 @@ class Router:
                     potential_deployments.append(item)
             item = random.choice(potential_deployments)
             return item or item[0]
+        elif self.routing_strategy == "latency-based-routing":
+            returned_item = None
+            lowest_latency = float('inf')
+            ### get potential deployments
+            potential_deployments = []
+            for item in self.model_list:
+                if item["model_name"] == model:
+                    potential_deployments.append(item)
+            ### shuffles with priority for lowest latency
+            # items_with_latencies = [('A', 10), ('B', 20), ('C', 30), ('D', 40)]
+            items_with_latencies = []
+            for item in potential_deployments:
+                items_with_latencies.append((item, self.deployment_latency_map[item["litellm_params"]["model"]]))
+            returned_item = self.weighted_shuffle_by_latency(items_with_latencies)
+            return returned_item
         elif self.routing_strategy == "usage-based-routing":
             return self.get_usage_based_available_deployment(model=model, messages=messages, input=input)
 
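Deployment selection stays behind the same entry point the other strategies use; a minimal sketch (assumes a router built with routing_strategy="latency-based-routing" as in the earlier sketch):

    # Illustrative sketch only.
    deployment = router.get_available_deployment(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )
    # one deployment dict, biased toward the lowest recorded latency
    print(deployment["litellm_params"]["model"])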
@@ -238,14 +272,23 @@ class Router:
         Example usage:
         response = router.completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}])
         """
 
-        # pick the one that is available (lowest TPM/RPM)
-        deployment = self.get_available_deployment(model=model, messages=messages)
-        data = deployment["litellm_params"]
-        for k, v in self.default_litellm_params.items():
-            if k not in data: # prioritize model-specific params > default router params
-                data[k] = v
-        return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
+        try:
+            # pick the one that is available (lowest TPM/RPM)
+            deployment = self.get_available_deployment(model=model, messages=messages)
+            data = deployment["litellm_params"]
+            for k, v in self.default_litellm_params.items():
+                if k not in data: # prioritize model-specific params > default router params
+                    data[k] = v
+            return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
+        except Exception as e:
+            if self.num_retries > 0:
+                kwargs["model"] = model
+                kwargs["messages"] = messages
+                kwargs["original_exception"] = e
+                kwargs["original_function"] = self.completion
+                return self.function_with_retries(**kwargs)
+            else:
+                raise e
 
 
     async def acompletion(self,
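With this change a caller does not handle transient provider errors itself; a minimal sketch (num_retries=2 is an illustrative value, and the retry behavior is whatever function_with_retries implements):

    # Illustrative sketch only -- retries happen transparently inside router.completion().
    router = Router(model_list=model_list, num_retries=2)
    response = router.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )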
@@ -261,9 +304,6 @@ class Router:
                 if k not in data: # prioritize model-specific params > default router params
                     data[k] = v
             response = await litellm.acompletion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
-            # client = AsyncOpenAI()
-            # print(f"MAKING OPENAI CALL")
-            # response = await client.chat.completions.create(model=model, messages=messages)
             return response
         except Exception as e:
             if self.num_retries > 0:
@@ -282,17 +322,26 @@ class Router:
                         is_fallback: Optional[bool] = False,
                         is_async: Optional[bool] = False,
                         **kwargs):
+        try:
-        messages=[{"role": "user", "content": prompt}]
-        # pick the one that is available (lowest TPM/RPM)
-        deployment = self.get_available_deployment(model=model, messages=messages)
-
+            messages=[{"role": "user", "content": prompt}]
+            # pick the one that is available (lowest TPM/RPM)
+            deployment = self.get_available_deployment(model=model, messages=messages)
+
-        data = deployment["litellm_params"]
-        for k, v in self.default_litellm_params.items():
-            if k not in data: # prioritize model-specific params > default router params
-                data[k] = v
-        # call via litellm.completion()
-        return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs}) # type: ignore
+            data = deployment["litellm_params"]
+            for k, v in self.default_litellm_params.items():
+                if k not in data: # prioritize model-specific params > default router params
+                    data[k] = v
+            # call via litellm.completion()
+            return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs}) # type: ignore
+        except Exception as e:
+            if self.num_retries > 0:
+                kwargs["model"] = model
+                kwargs["messages"] = messages
+                kwargs["original_exception"] = e
+                kwargs["original_function"] = self.completion
+                return self.function_with_retries(**kwargs)
+            else:
+                raise e
 
     def embedding(self,
                   model: str,
@@ -344,6 +393,20 @@ class Router:
         else:
             total_tokens = completion_response['usage']['total_tokens']
             self._set_deployment_usage(model_name, total_tokens)
+
+        self.deployment_latency_map[model_name] = (end_time - start_time).total_seconds()
+
+    def deployment_callback_on_failure(
+        self,
+        kwargs,                 # kwargs to completion
+        completion_response,    # response from completion
+        start_time, end_time    # start/end time
+    ):
+        model_name = kwargs.get('model', None)  # i.e. gpt35turbo
+        custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None)  # i.e. azure
+        if custom_llm_provider:
+            model_name = f"{custom_llm_provider}/{model_name}"
+        self.deployment_latency_map[model_name] = float('inf')
 
     def get_usage_based_available_deployment(self,
                                              model: str,
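These two callbacks are what feed the routing strategy: a successful call records the wall-clock duration for its deployment, and a failed call pins that deployment's entry to float('inf'). An illustrative snapshot of the map (keys and values are made up to show the shape):

    # Illustrative sketch only.
    # router.deployment_latency_map might look like:
    # {
    #     "azure/chatgpt-v-2": 0.8,                       # last successful call took ~0.8s
    #     "azure/chatgpt-functioncalling": float("inf"),  # last call raised an exception
    #     "gpt-3.5-turbo": 0,                             # not called yet (initial value)
    # }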
@@ -56,85 +56,87 @@ load_dotenv()
 # test_openai_only()
 
 
-# def test_multiple_deployments():
-#     import concurrent
-#     # litellm.set_verbose=True
-#     futures = {}
-#     model_list = [{ # list of model deployments
-#         "model_name": "gpt-3.5-turbo", # openai model name
-#         "litellm_params": { # params for litellm completion/embedding call
-#             "model": "azure/chatgpt-v-2",
-#             "api_key": os.getenv("AZURE_API_KEY"),
-#             "api_version": os.getenv("AZURE_API_VERSION"),
-#             "api_base": os.getenv("AZURE_API_BASE")
-#         },
-#         "tpm": 240000,
-#         "rpm": 1800
-#     }, {
-#         "model_name": "gpt-3.5-turbo", # openai model name
-#         "litellm_params": { # params for litellm completion/embedding call
-#             "model": "azure/chatgpt-functioncalling",
-#             "api_key": os.getenv("AZURE_API_KEY"),
-#             "api_version": os.getenv("AZURE_API_VERSION"),
-#             "api_base": os.getenv("AZURE_API_BASE")
-#         },
-#         "tpm": 240000,
-#         "rpm": 1800
-#     }, {
-#         "model_name": "gpt-3.5-turbo", # openai model name
-#         "litellm_params": { # params for litellm completion/embedding call
-#             "model": "gpt-3.5-turbo",
-#             "api_key": os.getenv("OPENAI_API_KEY"),
-#         },
-#         "tpm": 1000000,
-#         "rpm": 9000
-#     }]
+def test_multiple_deployments():
+    import concurrent, time
+    # litellm.set_verbose=True
+    futures = {}
+    model_list = [{ # list of model deployments
+        "model_name": "gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE")
+        },
+        "tpm": 240000,
+        "rpm": 1800
+    }, {
+        "model_name": "gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-functioncalling",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE")
+        },
+        "tpm": 240000,
+        "rpm": 1800
+    }, {
+        "model_name": "gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "gpt-3.5-turbo",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+        "tpm": 1000000,
+        "rpm": 9000
+    }]
 
-#     router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT"))) # type: ignore
+    router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT")), routing_strategy="latency-based-routing") # type: ignore
 
-#     results = []
-#     with ThreadPoolExecutor(max_workers=10) as executor:
-#         kwargs = {
-#             "model": "gpt-3.5-turbo",
-#             "messages": [{"role": "user", "content": """Context:
+    results = []
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": """Context:
 
-# In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.
+In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.
 
-# Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.
+Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.
 
-# In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs.
+In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs.
 
-# Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.
+Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.
 
-# The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.
+The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.
 
-# ---
-# Question:
+---
+Question:
 
-# Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
-#         }
-#         for _ in range(10):
-#             future = executor.submit(router.completion, **kwargs)
-#             futures[future] = future
+Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
+        }
 
-#         # Retrieve the results from the futures
-#         while futures:
-#             done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
-#             for future in done:
-#                 try:
-#                     result = future.result()
-#                     print(f"result: {result}")
-#                     results.append(result)
-#                     del futures[future]
-#                 except Exception as e:
-#                     print(f"Exception: {e}; traceback: {traceback.format_exc()}")
-#                     del futures[future] # remove the done future
+        start_time = time.time()
+        for _ in range(1000):
+            future = executor.submit(router.completion, **kwargs)
+            futures[future] = future
 
-#     # Check results
-#     print(results)
+        # Retrieve the results from the futures
+        while futures:
+            done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
+            for future in done:
+                try:
+                    result = future.result()
+                    results.append(result)
+                    del futures[future]
+                except Exception as e:
+                    print(f"Exception: {e}; traceback: {traceback.format_exc()}")
+                    del futures[future] # remove the done future
 
+        end_time = time.time()
+        print(f"ELAPSED TIME: {end_time-start_time}")
+        # Check results
 
 
-# test_multiple_deployments()
+test_multiple_deployments()
 ### FUNCTION CALLING
 
 def test_function_calling():
@@ -174,10 +176,11 @@ def test_function_calling():
         }
     ]
 
-    router = Router(model_list=model_list)
+    router = Router(model_list=model_list, routing_strategy="latency-based-routing")
     response = router.completion(model="gpt-3.5-turbo-0613", messages=messages, functions=functions)
+    print(response)
 
 # test_function_calling()
 # ### FUNCTION CALLING -> NORMAL COMPLETION
 # def test_litellm_params_not_overwritten_by_function_calling():
 #     try:
@@ -278,7 +281,7 @@ def test_acompletion_on_router():
         traceback.print_exc()
         pytest.fail(f"Error occurred: {e}")
 
-test_acompletion_on_router()
+# test_acompletion_on_router()
 
 def test_function_calling_on_router():
     try: