mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 19:24:27 +00:00)

fix(router.py): add modelgroup to call metadata

parent b6fe6b2839
commit 3a8d7ec835

3 changed files with 71 additions and 131 deletions
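Summary of the hunks below: the caching classes (InMemoryCache, RedisCache, DualCache) each gain a flush_cache() method, the Router stamps every completion/embedding call's kwargs metadata with the requested model group and exposes its own flush_cache(), and the router tests are revived to exercise both. A minimal, self-contained sketch of the metadata pattern the Router now applies (tag_model_group is an illustrative helper name, not part of the diff; the Router inlines the same one-liner):

# Illustrative sketch of the pattern added in this commit: tag the call's kwargs
# with the model group so downstream usage tracking can attribute the request.
def tag_model_group(kwargs: dict, model: str) -> dict:  # hypothetical helper
    # setdefault preserves any metadata the caller already supplied
    kwargs.setdefault("metadata", {}).update({"model_group": model})
    return kwargs

call_kwargs = {"messages": [{"role": "user", "content": "hi"}], "metadata": {"trace_id": "abc"}}
tag_model_group(call_kwargs, "gpt-3.5-turbo")
print(call_kwargs["metadata"])  # {'trace_id': 'abc', 'model_group': 'gpt-3.5-turbo'}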
@@ -62,6 +62,10 @@ class InMemoryCache(BaseCache):
                 cached_response['cache'] = True # set cache-hit flag to True
             return cached_response
         return None
 
+    def flush_cache(self):
+        self.cache_dict.clear()
+        self.ttl_dict.clear()
+
 
 class RedisCache(BaseCache):
@@ -97,6 +101,9 @@ class RedisCache(BaseCache):
             traceback.print_exc()
             logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
 
+    def flush_cache(self):
+        self.redis_client.flushall()
+
 class DualCache(BaseCache):
     """
     This updates both Redis and an in-memory cache simultaneously.
@@ -147,6 +154,10 @@ class DualCache(BaseCache):
             return result
         except Exception as e:
             traceback.print_exc()
 
+    def flush_cache(self):
+        self.redis_cache.flush_cache()
+        self.in_memory_cache.flush_cache()
+
 #### LiteLLM.Completion Cache ####
 class Cache:
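For context, a hedged usage sketch of the new flush hooks: the DualCache keyword arguments and the flush_cache() method names come from the hunks above, while the litellm.caching import path, the RedisCache constructor arguments, and the set_cache() call are assumptions for illustration.

# Hedged sketch: clear both layers of the dual cache, e.g. between test runs.
# The RedisCache(host=..., port=..., password=...) arguments are assumed here.
from litellm.caching import DualCache, InMemoryCache, RedisCache

dual_cache = DualCache(
    redis_cache=RedisCache(host="localhost", port=6379, password=None),
    in_memory_cache=InMemoryCache(),
)
dual_cache.set_cache("router:model_group:gpt-3.5-turbo:tpm", 0)  # key/value shape illustrative
dual_cache.flush_cache()  # clears the in-memory dicts and flushes Redis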
@@ -115,7 +115,7 @@ class Router:
         if cache_responses:
             litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests
             self.cache_responses = cache_responses
-        self.cache = DualCache(redis_cache=redis_cache) # use a dual cache (Redis+In-Memory) for tracking cooldowns, usage, etc.
+        self.cache = DualCache(redis_cache=redis_cache, in_memory_cache=InMemoryCache()) # use a dual cache (Redis+In-Memory) for tracking cooldowns, usage, etc.
         ## USAGE TRACKING ##
         if isinstance(litellm.success_callback, list):
             litellm.success_callback.append(self.deployment_callback)
@@ -143,6 +143,7 @@ class Router:
         kwargs["messages"] = messages
         kwargs["original_function"] = self._completion
         kwargs["num_retries"] = self.num_retries
+        kwargs.setdefault("metadata", {}).update({"model_group": model})
         with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
             # Submit the function to the executor with a timeout
             future = executor.submit(self.function_with_fallbacks, **kwargs)
@@ -180,7 +181,7 @@ class Router:
         kwargs["messages"] = messages
         kwargs["original_function"] = self._acompletion
         kwargs["num_retries"] = self.num_retries
+        kwargs.setdefault("metadata", {}).update({"model_group": model})
         # Use asyncio.timeout to enforce the timeout
         async with asyncio.timeout(self.timeout):  # type: ignore
             response = await self.async_function_with_fallbacks(**kwargs)
@@ -215,6 +216,7 @@ class Router:
                         is_async: Optional[bool] = False,
                         **kwargs):
         try:
+            kwargs.setdefault("metadata", {}).update({"model_group": model})
             messages=[{"role": "user", "content": prompt}]
             # pick the one that is available (lowest TPM/RPM)
             deployment = self.get_available_deployment(model=model, messages=messages)
@@ -241,6 +243,7 @@ class Router:
                   is_async: Optional[bool] = False,
                   **kwargs) -> Union[List[float], None]:
         # pick the one that is available (lowest TPM/RPM)
+        kwargs.setdefault("metadata", {}).update({"model_group": model})
         deployment = self.get_available_deployment(model=model, input=input)
 
         data = deployment["litellm_params"]
@@ -256,6 +259,7 @@ class Router:
                          is_async: Optional[bool] = True,
                          **kwargs) -> Union[List[float], None]:
         # pick the one that is available (lowest TPM/RPM)
+        kwargs.setdefault("metadata", {}).update({"model_group": model})
         deployment = self.get_available_deployment(model=model, input=input)
 
         data = deployment["litellm_params"]
@@ -420,8 +424,6 @@ class Router:
                     raise e
             raise original_exception
-
-
 
     def function_with_retries(self, *args, **kwargs):
         """
         Try calling the model 3 times. Shuffle between available deployments.
@@ -761,4 +763,6 @@ class Router:
             return self.get_usage_based_available_deployment(model=model, messages=messages, input=input)
 
         raise ValueError("No models available.")
 
+    def flush_cache(self):
+        self.cache.flush_cache()
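With the metadata in place, usage tracking downstream (for example the deployment_callback registered in the Router constructor shown above, or a custom success callback) can attribute traffic per model group. A hedged sketch, assuming litellm's custom-callback signature of (kwargs, completion_response, start_time, end_time); exactly where the metadata dict surfaces inside the callback kwargs is an assumption based on this diff:

# Hedged sketch: count completed calls per model group from a success callback.
usage_by_group = {}

def track_model_group(kwargs, completion_response, start_time, end_time):
    # Depending on the litellm version, metadata may sit at the top level of
    # kwargs or under litellm_params; both lookups below are assumptions.
    metadata = kwargs.get("metadata") or kwargs.get("litellm_params", {}).get("metadata") or {}
    group = metadata.get("model_group", "unknown")
    usage_by_group[group] = usage_by_group.get(group, 0) + 1

# Registration would mirror the Router's own hookup shown above, e.g.:
# litellm.success_callback.append(track_model_group)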
@@ -13,81 +13,51 @@ from concurrent.futures import ThreadPoolExecutor
 from dotenv import load_dotenv
 load_dotenv()
 
-# def test_multiple_deployments():
-#     import concurrent, time
-#     litellm.set_verbose=False
-#     futures = {}
-#     model_list = [{ # list of model deployments
-#         "model_name": "gpt-3.5-turbo", # openai model name
-#         "litellm_params": { # params for litellm completion/embedding call
-#             "model": "azure/chatgpt-v-2",
-#             "api_key": "bad-key",
-#             "api_version": os.getenv("AZURE_API_VERSION"),
-#             "api_base": os.getenv("AZURE_API_BASE")
-#         },
-#         "tpm": 240000,
-#         "rpm": 1800
-#     },
-#     # {
-#     #     "model_name": "gpt-3.5-turbo", # openai model name
-#     #     "litellm_params": { # params for litellm completion/embedding call
-#     #         "model": "azure/chatgpt-functioncalling",
-#     #         "api_key": "bad-key",
-#     #         "api_version": os.getenv("AZURE_API_VERSION"),
-#     #         "api_base": os.getenv("AZURE_API_BASE")
-#     #     },
-#     #     "tpm": 240000,
-#     #     "rpm": 1800
-#     # },
-#     {
-#         "model_name": "gpt-3.5-turbo", # openai model name
-#         "litellm_params": { # params for litellm completion/embedding call
-#             "model": "gpt-3.5-turbo",
-#             "api_key": os.getenv("OPENAI_API_KEY"),
-#         },
-#         "tpm": 1000000,
-#         "rpm": 9000
-#     }
-#     ]
-
-#     router = Router(model_list=model_list,
-#                     redis_host=os.getenv("REDIS_HOST"),
-#                     redis_password=os.getenv("REDIS_PASSWORD"),
-#                     redis_port=int(os.getenv("REDIS_PORT")),
-#                     routing_strategy="simple-shuffle",
-#                     set_verbose=False,
-#                     num_retries=1) # type: ignore
-#     # router = Router(model_list=model_list, redis_host=os.getenv("REDIS_HOST"), redis_password=os.getenv("REDIS_PASSWORD"), redis_port=int(os.getenv("REDIS_PORT"))) # type: ignore
-#     kwargs = {
-#         "model": "gpt-3.5-turbo",
-#         "messages": [{"role": "user", "content": """Context:
-
-# In the historical era of Ancient Greece, a multitude of significant individuals lived, contributing immensely to various disciplines like science, politics, philosophy, and literature. For instance, Socrates, a renowned philosopher, primarily focused on ethics. His notable method, the Socratic Method, involved acknowledging one's own ignorance to stimulate critical thinking and illuminate ideas. His student, Plato, another prominent figure, founded the Academy in Athens. He proposed theories on justice, beauty, and equality, and also introduced the theory of forms, which is pivotal to understanding his philosophical insights. Another student of Socrates, Xenophon, distinguished himself more in the domain of history and military affairs.
-
-# Aristotle, who studied under Plato, led an equally remarkable life. His extensive works have been influential across various domains, including science, logic, metaphysics, ethics, and politics. Perhaps most notably, a substantial portion of the Western intellectual tradition traces back to his writings. He later tutored Alexander the Great who went on to create one of the most vast empires in the world.
-
-# In the domain of mathematics, Pythagoras and Euclid made significant contributions. Pythagoras is best known for the Pythagorean theorem, a fundamental principle in geometry, while Euclid, often regarded as the father of geometry, wrote "The Elements", a collection of definitions, axioms, theorems, and proofs.
-
-# Apart from these luminaries, the period also saw a number of influential political figures. Pericles, a prominent and influential Greek statesman, orator, and general of Athens during the Golden Age, specifically between the Persian and Peloponnesian wars, played a significant role in developing the Athenian democracy.
-
-# The Ancient Greek era also witnessed extraordinary advancements in arts and literature. Homer, credited with the creation of the epic poems 'The Iliad' and 'The Odyssey,' is considered one of the greatest poets in history. The tragedies of Sophocles, Aeschylus, and Euripides left an indelible mark on the field of drama, and the comedies of Aristophanes remain influential even today.
-
-# ---
-# Question:
-
-# Who among the mentioned figures from Ancient Greece contributed to the domain of mathematics and what are their significant contributions?"""}],
-#     }
-
-#     results = []
-
-#     try:
-#         for _ in range(3):
-#             response = router.completion(**kwargs)
-#             results.append(response)
-#     except Exception as e:
-#         raise e
-#     print(len(results))
-#     with ThreadPoolExecutor(max_workers=100) as executor:
+def test_multiple_deployments():
+    import concurrent, time
+    litellm.set_verbose=False
+    futures = {}
+    model_list = [{ # list of model deployments
+        "model_name": "gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": "bad-key",
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE")
+        },
+        "tpm": 240000,
+        "rpm": 1800
+    },
+    {
+        "model_name": "gpt-3.5-turbo", # openai model name
+        "litellm_params": { # params for litellm completion/embedding call
+            "model": "gpt-3.5-turbo",
+            "api_key": os.getenv("OPENAI_API_KEY"),
+        },
+        "tpm": 1000000,
+        "rpm": 9000
+    }
+    ]
+
+    router = Router(model_list=model_list,
+                    redis_host=os.getenv("REDIS_HOST"),
+                    redis_password=os.getenv("REDIS_PASSWORD"),
+                    redis_port=int(os.getenv("REDIS_PORT")),
+                    routing_strategy="simple-shuffle",
+                    set_verbose=False,
+                    num_retries=1) # type: ignore
+    kwargs = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey, how's it going?"}],}
+
+    results = []
+
+    try:
+        for _ in range(3):
+            response = router.completion(**kwargs)
+            results.append(response)
+        router.flush_cache()
+    except Exception as e:
+        print(f"FAILED TEST!")
+        pytest.fail(f"An error occurred - {str(e)}")
 
 # start_time = time.time()
 # for _ in range(1000):
@@ -113,7 +83,7 @@ load_dotenv()
 # Check results
 
 
-# test_multiple_deployments()
+test_multiple_deployments()
 
 def test_exception_raising():
     # this tests if the router raises an exception when invalid params are set
@@ -163,9 +133,11 @@ def test_exception_raising():
             ]
         )
         os.environ["AZURE_API_KEY"] = old_api_key
+        pytest.fail(f"Should have raised an Auth Error")
     except openai.AuthenticationError:
         print("Test Passed: Caught an OPENAI AUTH Error, Good job. This is what we needed!")
         os.environ["AZURE_API_KEY"] = old_api_key
+        router.flush_cache()
     except Exception as e:
         os.environ["AZURE_API_KEY"] = old_api_key
         print("Got unexpected exception on router!", e)
@@ -175,7 +147,7 @@ def test_exception_raising():
 def test_reading_key_from_model_list():
     # this tests if the router raises an exception when invalid params are set
     # DO NOT REMOVE THIS TEST. It's an IMP ONE. Speak to Ishaan, if you are tring to remove this
-    litellm.set_verbose=True
+    litellm.set_verbose=False
     import openai
     try:
         print("testing if router raises an exception")
@@ -212,8 +184,10 @@ def test_reading_key_from_model_list():
             ]
         )
         os.environ["AZURE_API_KEY"] = old_api_key
+        router.flush_cache()
     except Exception as e:
         os.environ["AZURE_API_KEY"] = old_api_key
+        print(f"FAILED TEST")
         pytest.fail("Got unexpected exception on router!", e)
 # test_reading_key_from_model_list()
 
@@ -261,55 +235,6 @@ def test_function_calling():
     response = router.completion(model="gpt-3.5-turbo-0613", messages=messages, functions=functions)
     print(response)
 
-# test_function_calling()
-
-# ### FUNCTION CALLING -> NORMAL COMPLETION
-# def test_litellm_params_not_overwritten_by_function_calling():
-#     try:
-#         model_list = [
-#             {
-#                 "model_name": "gpt-3.5-turbo-0613",
-#                 "litellm_params": {
-#                     "model": "gpt-3.5-turbo-0613",
-#                     "api_key": os.getenv("OPENAI_API_KEY"),
-#                 },
-#                 "tpm": 100000,
-#                 "rpm": 10000,
-#             },
-#         ]
-
-#         messages = [
-#             {"role": "user", "content": "What is the weather like in Boston?"}
-#         ]
-#         functions = [
-#             {
-#                 "name": "get_current_weather",
-#                 "description": "Get the current weather in a given location",
-#                 "parameters": {
-#                     "type": "object",
-#                     "properties": {
-#                         "location": {
-#                             "type": "string",
-#                             "description": "The city and state, e.g. San Francisco, CA"
-#                         },
-#                         "unit": {
-#                             "type": "string",
-#                             "enum": ["celsius", "fahrenheit"]
-#                         }
-#                     },
-#                     "required": ["location"]
-#                 }
-#             }
-#         ]
-
-#         router = Router(model_list=model_list)
-#         _ = router.completion(model="gpt-3.5-turbo-0613", messages=messages, functions=functions)
-#         response = router.completion(model="gpt-3.5-turbo-0613", messages=messages)
-#         assert response.choices[0].finish_reason != "function_call"
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
-
-# test_litellm_params_not_overwritten_by_function_calling()
-
 def test_acompletion_on_router():
     try:
         litellm.set_verbose = True
@@ -337,7 +262,7 @@ def test_acompletion_on_router():
         ]
 
         messages = [
-            {"role": "user", "content": "What is the weather like in Boston?"}
+            {"role": "user", "content": f"What is the weather like in Boston {time.time()}?"}
         ]
         start_time = time.time()
         router = Router(model_list=model_list,
@@ -352,7 +277,7 @@ def test_acompletion_on_router():
             print(f"response1: {response1}")
             response2 = await router.acompletion(model="gpt-3.5-turbo", messages=messages)
             print(f"response2: {response2}")
-            assert response1["choices"][0]["message"]["content"] == response2["choices"][0]["message"]["content"]
+            assert response1.id == response2.id
         asyncio.run(get_response())
     except litellm.Timeout as e:
         end_time = time.time()
@@ -362,7 +287,7 @@ def test_acompletion_on_router():
         traceback.print_exc()
         pytest.fail(f"Error occurred: {e}")
 
-# test_acompletion_on_router()
+test_acompletion_on_router()
 
 def test_function_calling_on_router():
     try: