Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 18:54:30 +00:00)
(Feat) Add x-litellm-overhead-duration-ms and "x-litellm-response-duration-ms" in response from LiteLLM (#7899)
Squashed commits:
* add track_llm_api_timing
* add track_llm_api_timing
* test_litellm_overhead
* use ResponseMetadata class for setting hidden params and response overhead
* instrument http handler
* fix track_llm_api_timing
* track_llm_api_timing
* emit response overhead on hidden params
* fix resp metadata
* fix make_sync_openai_embedding_request
* test_aaaaatext_completion_endpoint fixes
* _get_value_from_hidden_params
* set_hidden_params
* test_litellm_overhead
* test_litellm_overhead
* test_litellm_overhead
* fix import
* test_litellm_overhead_stream
* add LiteLLMLoggingObject
* use diff folder for testing
* use diff folder for overhead testing
* test litellm overhead
* use typing
* clear typing
* test_litellm_overhead
* fix async_streaming
* update_response_metadata
* move test file
* apply metadata to the response object
This commit is contained in:
Parent: 63d7d04232
Commit: b6f2e659b9
17 changed files with 464 additions and 73 deletions
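
As context for the new test file below, here is a minimal sketch of how a client might read the two headers named in the commit title from a LiteLLM proxy response. The proxy address, API key, and the assumption that the headers are attached to /v1/chat/completions responses are illustrative only and are not taken from this commit.

# Hypothetical sketch: read LiteLLM's new timing headers off a proxy response.
# PROXY_BASE and API_KEY are assumptions; header availability may vary by endpoint.
import httpx

PROXY_BASE = "http://localhost:4000"  # assumed proxy address
API_KEY = "sk-1234"                   # assumed proxy key

resp = httpx.post(
    f"{PROXY_BASE}/v1/chat/completions",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": "Hello, world!"}],
    },
    timeout=60,
)

# Header names come from the commit title.
overhead_ms = resp.headers.get("x-litellm-overhead-duration-ms")
response_ms = resp.headers.get("x-litellm-response-duration-ms")
print("litellm overhead (ms):", overhead_ms, "| llm response duration (ms):", response_ms)
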
tests/litellm_utils_tests/test_litellm_overhead.py (new file, 116 lines)
@@ -0,0 +1,116 @@
import json
import os
import sys
import time
from datetime import datetime
from unittest.mock import AsyncMock, patch, MagicMock

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "openai/self_hosted",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
    ],
)
async def test_litellm_overhead(model):

    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
        )
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000

    # latency overhead should be less than total request time
    assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000

    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40

    pass

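The streaming test that follows repeats the same overhead assertions. A small helper like the hypothetical one sketched here could consolidate those checks; it is not part of the commit, just an illustration of the shared pattern:

# Hypothetical helper (not in this commit): factor out the shared overhead checks.
def assert_reasonable_overhead(response, total_time_ms, max_percent=40):
    overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    overhead_percent = overhead_ms * 100 / total_time_ms
    # overhead must be measured, bounded, and a small fraction of the request
    assert 0 < overhead_ms < 1000
    assert overhead_ms < total_time_ms
    assert overhead_percent < max_percent
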
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
        "openai/self_hosted",
    ],
)
async def test_litellm_overhead_stream(model):

    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
            stream=True,
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            stream=True,
        )

    async for chunk in response:
        print()

    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000

    # latency overhead should be less than total request time
    assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000

    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40

    pass
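
Beyond the async tests above, a quick way to inspect the new metric on a plain synchronous call might look like the sketch below. Whether the hidden param is populated on every code path is an assumption here, not something this test file verifies.

# Hedged usage sketch: read the same hidden param on a synchronous call.
# Assumes litellm_overhead_time_ms is also populated for litellm.completion().
import litellm

response = litellm.completion(
    model="openai/gpt-4o",
    messages=[{"role": "user", "content": "Hello, world!"}],
)
print(response._hidden_params.get("litellm_overhead_time_ms"))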