mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-25 18:54:30 +00:00
(Feat) Add "x-litellm-overhead-duration-ms" and "x-litellm-response-duration-ms" headers in response from LiteLLM (#7899)
* add track_llm_api_timing * add track_llm_api_timing * test_litellm_overhead * use ResponseMetadata class for setting hidden params and response overhead * instrument http handler * fix track_llm_api_timing * track_llm_api_timing * emit response overhead on hidden params * fix resp metadata * fix make_sync_openai_embedding_request * test_aaaaatext_completion_endpoint fixes * _get_value_from_hidden_params * set_hidden_params * test_litellm_overhead * test_litellm_overhead * test_litellm_overhead * fix import * test_litellm_overhead_stream * add LiteLLMLoggingObject * use diff folder for testing * use diff folder for overhead testing * test litellm overhead * use typing * clear typing * test_litellm_overhead * fix async_streaming * update_response_metadata * move test file * apply metadata to the response object
This commit is contained in:
parent
2c7082853c
commit
359a4ee3a9
17 changed files with 464 additions and 73 deletions
|
@ -733,11 +733,13 @@ def get_custom_headers(
|
|||
version: Optional[str] = None,
|
||||
model_region: Optional[str] = None,
|
||||
response_cost: Optional[Union[float, str]] = None,
|
||||
hidden_params: Optional[dict] = None,
|
||||
fastest_response_batch_completion: Optional[bool] = None,
|
||||
request_data: Optional[dict] = {},
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
exclude_values = {"", None}
|
||||
hidden_params = hidden_params or {}
|
||||
headers = {
|
||||
"x-litellm-call-id": call_id,
|
||||
"x-litellm-model-id": model_id,
|
||||
|
@ -750,6 +752,10 @@ def get_custom_headers(
|
|||
"x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit),
|
||||
"x-litellm-key-max-budget": str(user_api_key_dict.max_budget),
|
||||
"x-litellm-key-spend": str(user_api_key_dict.spend),
|
||||
"x-litellm-response-duration-ms": str(hidden_params.get("_response_ms", None)),
|
||||
"x-litellm-overhead-duration-ms": str(
|
||||
hidden_params.get("litellm_overhead_time_ms", None)
|
||||
),
|
||||
"x-litellm-fastest_response_batch_completion": (
|
||||
str(fastest_response_batch_completion)
|
||||
if fastest_response_batch_completion is not None
|
||||
|
@ -3491,6 +3497,7 @@ async def chat_completion( # noqa: PLR0915
|
|||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
fastest_response_batch_completion=fastest_response_batch_completion,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
**additional_headers,
|
||||
)
|
||||
selected_data_generator = select_data_generator(
|
||||
|
@ -3526,6 +3533,7 @@ async def chat_completion( # noqa: PLR0915
|
|||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
fastest_response_batch_completion=fastest_response_batch_completion,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
**additional_headers,
|
||||
)
|
||||
)
|
||||
|
@ -3719,6 +3727,7 @@ async def completion( # noqa: PLR0915
|
|||
api_base=api_base,
|
||||
version=version,
|
||||
response_cost=response_cost,
|
||||
hidden_params=hidden_params,
|
||||
request_data=data,
|
||||
)
|
||||
selected_data_generator = select_data_generator(
|
||||
|
@ -3747,6 +3756,7 @@ async def completion( # noqa: PLR0915
|
|||
version=version,
|
||||
response_cost=response_cost,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
await check_response_size_is_safe(response=response)
|
||||
|
@ -3977,6 +3987,7 @@ async def embeddings( # noqa: PLR0915
|
|||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
call_id=litellm_call_id,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
**additional_headers,
|
||||
)
|
||||
)
|
||||
|
@ -4103,6 +4114,7 @@ async def image_generation(
|
|||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
call_id=litellm_call_id,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -4223,6 +4235,7 @@ async def audio_speech(
|
|||
fastest_response_batch_completion=None,
|
||||
call_id=litellm_call_id,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
|
||||
select_data_generator(
|
||||
|
@ -4362,6 +4375,7 @@ async def audio_transcriptions(
|
|||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
call_id=litellm_call_id,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
**additional_headers,
|
||||
)
|
||||
)
|
||||
|
@ -4510,6 +4524,7 @@ async def get_assistants(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -4607,6 +4622,7 @@ async def create_assistant(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -4703,6 +4719,7 @@ async def delete_assistant(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -4799,6 +4816,7 @@ async def create_threads(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -4894,6 +4912,7 @@ async def get_thread(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -4992,6 +5011,7 @@ async def add_messages(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -5086,6 +5106,7 @@ async def get_messages(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -5194,6 +5215,7 @@ async def run_thread(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -5316,6 +5338,7 @@ async def moderations(
|
|||
version=version,
|
||||
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -5488,6 +5511,7 @@ async def anthropic_response( # noqa: PLR0915
|
|||
version=version,
|
||||
response_cost=response_cost,
|
||||
request_data=data,
|
||||
hidden_params=hidden_params,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue