(Feat) Add "x-litellm-overhead-duration-ms" and "x-litellm-response-duration-ms" in responses from LiteLLM (#7899)

* add track_llm_api_timing

* add track_llm_api_timing

* test_litellm_overhead

* use ResponseMetadata class for setting hidden params and response overhead

* instrument http handler

* fix track_llm_api_timing

* track_llm_api_timing

* emit response overhead on hidden params

* fix resp metadata

* fix make_sync_openai_embedding_request

* test_aaaaatext_completion_endpoint fixes

* _get_value_from_hidden_params

* set_hidden_params

* test_litellm_overhead

* test_litellm_overhead

* test_litellm_overhead

* fix import

* test_litellm_overhead_stream

* add LiteLLMLoggingObject

* use diff folder for testing

* use diff folder for overhead testing

* test litellm overhead

* use typing

* clear typing

* test_litellm_overhead

* fix async_streaming

* update_response_metadata

* move test file

* apply metadata to the response object
Ishaan Jaff 2025-01-21 20:27:55 -08:00 committed by GitHub
parent 63d7d04232
commit b6f2e659b9
17 changed files with 464 additions and 73 deletions
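The squashed commits above outline the approach: time the raw provider API call with `track_llm_api_timing`, record the total handling time and the overhead on the response's hidden params via a `ResponseMetadata` helper, and surface both as headers. Below is a minimal conceptual sketch of that split; the function and argument names are illustrative and this is not the PR's actual code. Only the hidden-param keys `_response_ms` and `litellm_overhead_time_ms` are taken from the diff that follows.

```python
import time


def call_with_overhead_tracking(make_provider_request, preprocess, postprocess, request):
    """Conceptual sketch: overhead = total handling time minus the raw provider-call time."""
    total_start = time.time()
    payload = preprocess(request)               # LiteLLM-side work (routing, transforms, ...)

    api_start = time.time()
    raw = make_provider_request(payload)        # the part track_llm_api_timing measures
    api_ms = (time.time() - api_start) * 1000

    response = postprocess(raw)                 # LiteLLM-side work (cost calc, logging, ...)
    total_ms = (time.time() - total_start) * 1000

    hidden_params = {
        "_response_ms": total_ms,                        # -> x-litellm-response-duration-ms
        "litellm_overhead_time_ms": total_ms - api_ms,   # -> x-litellm-overhead-duration-ms
    }
    return response, hidden_params


# Tiny demo with stand-in callables:
response, hp = call_with_overhead_tracking(
    make_provider_request=lambda payload: {"echo": payload},
    preprocess=lambda req: req,
    postprocess=lambda raw: raw,
    request={"messages": []},
)
print(hp)
```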

@@ -733,11 +733,13 @@ def get_custom_headers(
version: Optional[str] = None,
model_region: Optional[str] = None,
response_cost: Optional[Union[float, str]] = None,
hidden_params: Optional[dict] = None,
fastest_response_batch_completion: Optional[bool] = None,
request_data: Optional[dict] = {},
**kwargs,
) -> dict:
exclude_values = {"", None}
hidden_params = hidden_params or {}
headers = {
"x-litellm-call-id": call_id,
"x-litellm-model-id": model_id,
@@ -750,6 +752,10 @@ def get_custom_headers(
"x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit),
"x-litellm-key-max-budget": str(user_api_key_dict.max_budget),
"x-litellm-key-spend": str(user_api_key_dict.spend),
"x-litellm-response-duration-ms": str(hidden_params.get("_response_ms", None)),
"x-litellm-overhead-duration-ms": str(
hidden_params.get("litellm_overhead_time_ms", None)
),
"x-litellm-fastest_response_batch_completion": (
str(fastest_response_batch_completion)
if fastest_response_batch_completion is not None
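With `get_custom_headers` now receiving `hidden_params`, the two durations show up as response headers on every proxy endpoint touched below. A quick way to check from a client, assuming a proxy running at a placeholder local address with a placeholder key:

```python
import requests

# Placeholder proxy URL and key -- adjust for your deployment.
resp = requests.post(
    "http://localhost:4000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "hello"}],
    },
)

print(resp.headers.get("x-litellm-response-duration-ms"))  # total time LiteLLM took
print(resp.headers.get("x-litellm-overhead-duration-ms"))  # time spent outside the provider call
```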
@@ -3491,6 +3497,7 @@ async def chat_completion( # noqa: PLR0915
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=fastest_response_batch_completion,
request_data=data,
hidden_params=hidden_params,
**additional_headers,
)
selected_data_generator = select_data_generator(
@@ -3526,6 +3533,7 @@ async def chat_completion( # noqa: PLR0915
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=fastest_response_batch_completion,
request_data=data,
hidden_params=hidden_params,
**additional_headers,
)
)
@@ -3719,6 +3727,7 @@ async def completion( # noqa: PLR0915
api_base=api_base,
version=version,
response_cost=response_cost,
hidden_params=hidden_params,
request_data=data,
)
selected_data_generator = select_data_generator(
@@ -3747,6 +3756,7 @@ async def completion( # noqa: PLR0915
version=version,
response_cost=response_cost,
request_data=data,
hidden_params=hidden_params,
)
)
await check_response_size_is_safe(response=response)
@@ -3977,6 +3987,7 @@ async def embeddings( # noqa: PLR0915
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
call_id=litellm_call_id,
request_data=data,
hidden_params=hidden_params,
**additional_headers,
)
)
@@ -4103,6 +4114,7 @@ async def image_generation(
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
call_id=litellm_call_id,
request_data=data,
hidden_params=hidden_params,
)
)
@@ -4223,6 +4235,7 @@ async def audio_speech(
fastest_response_batch_completion=None,
call_id=litellm_call_id,
request_data=data,
hidden_params=hidden_params,
)
select_data_generator(
@@ -4362,6 +4375,7 @@ async def audio_transcriptions(
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
call_id=litellm_call_id,
request_data=data,
hidden_params=hidden_params,
**additional_headers,
)
)
@@ -4510,6 +4524,7 @@ async def get_assistants(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -4607,6 +4622,7 @@ async def create_assistant(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -4703,6 +4719,7 @@ async def delete_assistant(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -4799,6 +4816,7 @@ async def create_threads(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -4894,6 +4912,7 @@ async def get_thread(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -4992,6 +5011,7 @@ async def add_messages(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -5086,6 +5106,7 @@ async def get_messages(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -5194,6 +5215,7 @@ async def run_thread(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -5316,6 +5338,7 @@ async def moderations(
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
request_data=data,
hidden_params=hidden_params,
)
)
@@ -5488,6 +5511,7 @@ async def anthropic_response( # noqa: PLR0915
version=version,
response_cost=response_cost,
request_data=data,
hidden_params=hidden_params,
)
)
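The same headers are also reachable through the OpenAI SDK pointed at the proxy, for example via `with_raw_response`; the base URL and key below are placeholders for your deployment:

```python
from openai import OpenAI

# Placeholder base_url / api_key for a LiteLLM proxy deployment.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)

print(raw.headers.get("x-litellm-response-duration-ms"))
print(raw.headers.get("x-litellm-overhead-duration-ms"))
completion = raw.parse()  # the regular ChatCompletion object
```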