forked from phoenix/litellm-mirror
test(caching_unit_tests.py): add unit tests for llm caching
ensures coverage for common caching scenarios across different implementations
This commit is contained in:
parent
0bc9864c09
commit
16bbed72d4
5 changed files with 244 additions and 188 deletions
|
@@ -1103,193 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
|
|||
raise e
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_disk_cache_completion(sync_mode):
    """Check that the disk cache serves repeated completion calls.

    Three calls are made with the same prompt, through ``completion`` when
    ``sync_mode`` is true and ``litellm.acompletion`` otherwise:

    * calls 1 and 2 use identical parameters but different ``mock_response``
      values, so call 2 must return the cached result of call 1;
    * call 3 replaces ``max_tokens`` with ``temperature``, so it must not be
      answered from the cache.
    """
    litellm._turn_on_debug()

    # add a random number to ensure it's always adding / reading from cache
    random_number = random.randint(1, 100000)
    messages = [
        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
    ]

    async def _complete(**params):
        """Issue one cached completion through the sync or async entry point."""
        if sync_mode:
            return completion(
                "gpt-3.5-turbo", messages=messages, caching=True, **params
            )
        return await litellm.acompletion(
            "gpt-3.5-turbo", messages=messages, caching=True, **params
        )

    litellm.cache = Cache(type="disk")
    try:
        response1 = await _complete(
            max_tokens=20, mock_response="This number is so great!"
        )

        # response2 is mocked to a different response from response1,
        # but the completion from the cache should be used instead of the mock
        # response since the input is the same as response1.
        # NOTE(review): the pause presumably gives the cache write time to
        # land before the second call - confirm.
        await asyncio.sleep(0.5)
        response2 = await _complete(
            max_tokens=20, mock_response="This number is great!"
        )

        content1 = response1["choices"][0]["message"]["content"]
        content2 = response2["choices"][0]["message"]["content"]
        if content1 != content2:
            # 1 & 2 have the exact same input params. This MUST be a cache hit.
            print(f"response1: {response1}")
            print(f"response2: {response2}")
            pytest.fail(
                f"Error occurred: response1 - {content1} != response2 - {content2}"
            )

        # Since the parameters are not the same as response1, response3 should
        # actually be the mock response.
        response3 = await _complete(
            temperature=0.5, mock_response="This number is awful!"
        )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
    finally:
        # Always reset the module-level state, even when a check above fails,
        # so a failure here cannot leak the disk cache into other tests.
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

    # 1 & 2 should be exactly the same
    # 1 & 3 should be different, since input params are diff
    content3 = response3["choices"][0]["message"]["content"]
    if content1 == content3:
        # if input params like max_tokens, temperature are diff it should NOT
        # be a cache hit
        print(f"response1: {response1}")
        print(f"response3: {response3}")
        pytest.fail(
            "Response 1 == response 3. Same model, different params should not "
            "be a cache hit."
        )

    assert response1.id == response2.id
    assert response1.created == response2.created
    assert response1.choices[0].message.content == response2.choices[0].message.content
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_disk_cache_embedding(sync_mode):
    """Check that the disk cache serves repeated embedding calls.

    Three calls are made with the same input, through ``embedding`` when
    ``sync_mode`` is true and ``litellm.aembedding`` otherwise:

    * calls 1 and 2 use identical parameters, so call 2 must be reported as
      a cache hit and carry the same ``id`` as call 1;
    * call 3 adds ``user="charlie"``, so it must not be a cache hit and must
      carry a different ``id``.
    """
    litellm._turn_on_debug()

    # add a random number to ensure it's always adding / reading from cache
    random_number = random.randint(1, 100000)
    embedding_input = [f"hello {random_number}"]

    async def _embed(**params):
        """Issue one cached embedding call through the sync or async entry point."""
        if sync_mode:
            return embedding(
                "openai/text-embedding-ada-002",
                input=embedding_input,
                caching=True,
                **params,
            )
        # The async path must use aembedding: acompletion would send the
        # embedding model to the completion API and never touch the
        # embedding cache.
        return await litellm.aembedding(
            "openai/text-embedding-ada-002",
            input=embedding_input,
            caching=True,
            **params,
        )

    litellm.cache = Cache(type="disk")
    try:
        response1 = await _embed()

        # Same input as response1, so the second call should come from the
        # cache.
        # NOTE(review): the pause presumably gives the cache write time to
        # land before the second call - confirm.
        await asyncio.sleep(0.5)
        response2 = await _embed()

        if response2._hidden_params["cache_hit"] is not True:
            pytest.fail("Cache hit should be True")
        assert response1.id == response2.id

        # The extra `user` parameter makes this request differ from
        # response1, so it should not be served from the cache.
        response3 = await _embed(user="charlie")

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
    finally:
        # Always reset the module-level state, even when a check above fails,
        # so a failure here cannot leak the disk cache into other tests.
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

    # 1 & 2 should be exactly the same
    # 1 & 3 should be different, since input params are diff
    if response3._hidden_params["cache_hit"] is True:
        pytest.fail("Cache hit should not be True")

    assert response1.id != response3.id
|
||||
|
||||
|
||||
# @pytest.mark.skip(reason="AWS Suspended Account")
|
||||
@pytest.mark.parametrize("sync_mode", [True, False])
|
||||
@pytest.mark.asyncio
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue