Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-25 10:44:24 +00:00
(Feat) Add x-litellm-overhead-duration-ms and x-litellm-response-duration-ms in responses from LiteLLM (#7899)
* add track_llm_api_timing
* add track_llm_api_timing
* test_litellm_overhead
* use ResponseMetadata class for setting hidden params and response overhead
* instrument http handler
* fix track_llm_api_timing
* track_llm_api_timing
* emit response overhead on hidden params
* fix resp metadata
* fix make_sync_openai_embedding_request
* test_aaaaatext_completion_endpoint fixes
* _get_value_from_hidden_params
* set_hidden_params
* test_litellm_overhead
* test_litellm_overhead
* test_litellm_overhead
* fix import
* test_litellm_overhead_stream
* add LiteLLMLoggingObject
* use diff folder for testing
* use diff folder for overhead testing
* test litellm overhead
* use typing
* clear typing
* test_litellm_overhead
* fix async_streaming
* update_response_metadata
* move test file
* apply metadata to the response object
Parent: 63d7d04232
Commit: b6f2e659b9
17 changed files with 464 additions and 73 deletions
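For a quick sanity check of the two new headers, any HTTP client pointed at a running LiteLLM proxy will do. A minimal sketch follows, assuming a local proxy on port 4000, a placeholder API key, and a placeholder model; only the two header names come from this commit.

# Minimal sketch: read the new timing headers from a LiteLLM proxy response.
# The URL, key, and model below are assumptions for illustration only.
import requests

resp = requests.post(
    "http://localhost:4000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.headers.get("x-litellm-overhead-duration-ms"))  # overhead added by LiteLLM
print(resp.headers.get("x-litellm-response-duration-ms"))  # total response duration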
litellm/utils.py (111 changed lines)
@@ -93,6 +93,9 @@ from litellm.litellm_core_utils.llm_response_utils.get_formatted_prompt import (
from litellm.litellm_core_utils.llm_response_utils.get_headers import (
    get_response_headers,
)
from litellm.litellm_core_utils.llm_response_utils.response_metadata import (
    ResponseMetadata,
)
from litellm.litellm_core_utils.redact_messages import (
    LiteLLMLoggingObject,
    redact_message_input_output_from_logging,
@@ -929,6 +932,15 @@ def client(original_function):  # noqa: PLR0915
                        chunks, messages=kwargs.get("messages", None)
                    )
                else:
                    # RETURN RESULT
                    update_response_metadata(
                        result=result,
                        logging_obj=logging_obj,
                        model=model,
                        kwargs=kwargs,
                        start_time=start_time,
                        end_time=end_time,
                    )
                    return result
            elif "acompletion" in kwargs and kwargs["acompletion"] is True:
                return result
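The split between total duration and LiteLLM overhead only works if the time spent in the raw provider call is measured separately; the commit message refers to this as track_llm_api_timing and to instrumenting the HTTP handler. The decorator below is a hedged sketch of that idea, not the implementation from this commit, and the llm_api_duration_ms attribute name is invented for illustration.

# Hedged sketch of API-call timing instrumentation (not the actual
# track_llm_api_timing from this commit). It records how long the raw
# provider call took so overhead can later be derived as total wall time
# minus API time.
import functools
import time


def track_llm_api_timing(func):
    @functools.wraps(func)
    def wrapper(*args, logging_obj=None, **kwargs):
        start = time.perf_counter()
        try:
            return func(*args, logging_obj=logging_obj, **kwargs)
        finally:
            elapsed_ms = (time.perf_counter() - start) * 1000
            if logging_obj is not None:
                # assumed attribute name, for illustration only
                logging_obj.llm_api_duration_ms = elapsed_ms

    return wrapper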
@@ -966,25 +978,14 @@ def client(original_function):  # noqa: PLR0915
                end_time,
            )
            # RETURN RESULT
            if hasattr(result, "_hidden_params"):
                result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
                    "id", None
                )
                result._hidden_params["api_base"] = get_api_base(
                    model=model or "",
                    optional_params=getattr(logging_obj, "optional_params", {}),
                )
                result._hidden_params["response_cost"] = (
                    logging_obj._response_cost_calculator(result=result)
                )

                result._hidden_params["additional_headers"] = process_response_headers(
                    result._hidden_params.get("additional_headers") or {}
                )  # GUARANTEE OPENAI HEADERS IN RESPONSE
            if result is not None:
                result._response_ms = (
                    end_time - start_time
                ).total_seconds() * 1000  # return response latency in ms like openai
            update_response_metadata(
                result=result,
                logging_obj=logging_obj,
                model=model,
                kwargs=kwargs,
                start_time=start_time,
                end_time=end_time,
            )
            return result
        except Exception as e:
            call_type = original_function.__name__
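On the SDK path the same data ends up on the returned object rather than in response headers. A hedged usage sketch, assuming a configured provider key; the litellm_overhead_time_ms key comes from the docstring of update_response_metadata further down, while _hidden_params and _response_ms are internal fields shown in this diff.

# Hedged usage sketch: read the consolidated metadata off a completion result.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",  # assumed model; requires a configured API key
    messages=[{"role": "user", "content": "hello"}],
)
print(response._hidden_params.get("litellm_overhead_time_ms"))
print(getattr(response, "_response_ms", None))  # total latency in ms, as before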
@@ -1116,39 +1117,17 @@ def client(original_function):  # noqa: PLR0915
                        chunks, messages=kwargs.get("messages", None)
                    )
                else:
                    update_response_metadata(
                        result=result,
                        logging_obj=logging_obj,
                        model=model,
                        kwargs=kwargs,
                        start_time=start_time,
                        end_time=end_time,
                    )
                    return result
            elif call_type == CallTypes.arealtime.value:
                return result

            # ADD HIDDEN PARAMS - additional call metadata
            if hasattr(result, "_hidden_params"):
                result._hidden_params["litellm_call_id"] = getattr(
                    logging_obj, "litellm_call_id", None
                )
                result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
                    "id", None
                )
                result._hidden_params["api_base"] = get_api_base(
                    model=model or "",
                    optional_params=kwargs,
                )
                result._hidden_params["response_cost"] = (
                    logging_obj._response_cost_calculator(result=result)
                )
                result._hidden_params["additional_headers"] = process_response_headers(
                    result._hidden_params.get("additional_headers") or {}
                )  # GUARANTEE OPENAI HEADERS IN RESPONSE
            if (
                isinstance(result, ModelResponse)
                or isinstance(result, EmbeddingResponse)
                or isinstance(result, TranscriptionResponse)
            ):
                setattr(
                    result,
                    "_response_ms",
                    (end_time - start_time).total_seconds() * 1000,
                )  # return response latency in ms like openai

            ### POST-CALL RULES ###
            post_call_processing(
                original_response=result, model=model, optional_params=kwargs
@@ -1190,6 +1169,15 @@ def client(original_function):  # noqa: PLR0915
                end_time=end_time,
            )

            update_response_metadata(
                result=result,
                logging_obj=logging_obj,
                model=model,
                kwargs=kwargs,
                start_time=start_time,
                end_time=end_time,
            )

            return result
        except Exception as e:
            traceback_exception = traceback.format_exc()
@@ -1293,6 +1281,31 @@ def _is_async_request(
    return False


def update_response_metadata(
    result: Any,
    logging_obj: LiteLLMLoggingObject,
    model: Optional[str],
    kwargs: dict,
    start_time: datetime.datetime,
    end_time: datetime.datetime,
) -> None:
    """
    Updates response metadata, adds the following:
        - response._hidden_params
        - response._hidden_params["litellm_overhead_time_ms"]
        - response.response_time_ms
    """
    if result is None:
        return

    metadata = ResponseMetadata(result)
    metadata.set_hidden_params(logging_obj=logging_obj, model=model, kwargs=kwargs)
    metadata.set_timing_metrics(
        start_time=start_time, end_time=end_time, logging_obj=logging_obj
    )
    metadata.apply()


def _select_tokenizer(
    model: str, custom_tokenizer: Optional[CustomHuggingfaceTokenizer] = None
):
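The ResponseMetadata class itself lives in litellm/litellm_core_utils/llm_response_utils/response_metadata.py and is not part of this file's diff. As a rough mental model only, a minimal stand-in might look like the sketch below; apart from the method names called by update_response_metadata above, every field and body here is an assumption.

# Minimal stand-in illustrating the shape implied by update_response_metadata;
# not the actual ResponseMetadata implementation from this commit.
from datetime import datetime
from typing import Any, Optional


class ResponseMetadataSketch:
    def __init__(self, result: Any) -> None:
        self.result = result
        self.hidden_params: dict = getattr(result, "_hidden_params", {}) or {}
        self.response_ms: Optional[float] = None

    def set_hidden_params(self, logging_obj: Any, model: Optional[str], kwargs: dict) -> None:
        # assumed keys, mirroring the per-call assignments removed above
        self.hidden_params["model_id"] = kwargs.get("model_info", {}).get("id", None)
        self.hidden_params["litellm_call_id"] = getattr(logging_obj, "litellm_call_id", None)

    def set_timing_metrics(self, start_time: datetime, end_time: datetime, logging_obj: Any) -> None:
        total_ms = (end_time - start_time).total_seconds() * 1000
        self.response_ms = total_ms
        # assumed attribute: time spent in the provider call, recorded by timing
        # instrumentation; overhead is whatever LiteLLM added on top of it
        api_ms = getattr(logging_obj, "llm_api_duration_ms", None)
        if api_ms is not None:
            self.hidden_params["litellm_overhead_time_ms"] = total_ms - api_ms

    def apply(self) -> None:
        if hasattr(self.result, "_hidden_params"):
            self.result._hidden_params.update(self.hidden_params)
        setattr(self.result, "_response_ms", self.response_ms)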