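"""
Latency-overhead tests for LiteLLM.

Both tests call `litellm.acompletion` (non-streaming and streaming) against a
few providers and assert that the overhead reported in
`response._hidden_params["litellm_overhead_time_ms"]` is positive, under one
second, and under 40% of the total request time.
"""
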
import json
import os
import sys
import time
from datetime import datetime
from unittest.mock import AsyncMock, patch, MagicMock

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import litellm


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "openai/self_hosted",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
    ],
)
async def test_litellm_overhead(model):
    """Non-streaming: LiteLLM's reported overhead should be small relative to the request."""
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
        )
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000

    # latency overhead should be less than total request time
    assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000

    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40
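

# The streaming variant below asserts the same overhead bounds, but only after
# the stream is fully consumed, since the hidden params are read from the
# returned stream wrapper once iteration completes.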
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
        "openai/self_hosted",
    ],
)
async def test_litellm_overhead_stream(model):
    """Streaming: overhead is read from the stream wrapper after the stream is consumed."""
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
            stream=True,
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            stream=True,
        )

    # consume the stream so the overhead metric is populated on the wrapper
    async for chunk in response:
        print(chunk)

    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000

    # latency overhead should be less than total request time
    assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000

    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40
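

# A minimal usage sketch (not part of the test suite) of how application code
# could read the same overhead metric; the model name and message below are
# placeholders:
#
#   response = await litellm.acompletion(
#       model="openai/gpt-4o",
#       messages=[{"role": "user", "content": "Hello, world!"}],
#   )
#   overhead_ms = response._hidden_params.get("litellm_overhead_time_ms", 0)
#   print(f"LiteLLM overhead: {overhead_ms} ms")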