mirror of https://github.com/meta-llama/llama-stack.git
Clean up logging

parent 59211067d1
commit 4302200396

1 changed file with 30 additions and 16 deletions
@@ -104,11 +104,27 @@ logger = logging.getLogger(__name__)
 ############################################################################
 # Local functions go here
 
+# For debugging stuff when the Llama Stack logger isn't cooperating
+_BYPASS_LOGGING = False
+
+
+def _log(msg: str, level: str):
+    if _BYPASS_LOGGING:
+        time_str = datetime.datetime.now().strftime("%H:%M:%S")
+        print(f"{time_str}: {msg}")
+    match level:
+        case "info":
+            logger.info(msg)
+        case "debug":
+            logger.debug(msg)
+
 
 def _info(msg: str):
-    time_str = datetime.datetime.now().strftime("%H:%M:%S")
-    print(f"{time_str}: {msg}")
-    # logger.info(msg)
+    _log(msg, "info")
+
+
+def _debug(msg: str):
+    _log(msg, "debug")
 
 
 def _random_uuid_str() -> str:
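For context, the hunk above replaces the ad-hoc print-based _info() with helpers that route through a single _log() dispatcher, keeping a direct print fallback behind _BYPASS_LOGGING for when the Llama Stack logger isn't cooperating. A standalone sketch of the same pattern follows; the __main__ demo at the bottom is illustrative and not part of the commit, and match/case requires Python 3.10+.

import datetime
import logging

logger = logging.getLogger(__name__)

# For debugging stuff when the Llama Stack logger isn't cooperating
_BYPASS_LOGGING = False


def _log(msg: str, level: str):
    if _BYPASS_LOGGING:
        # Print directly so the message shows up even if the logging
        # configuration drops records below the configured level.
        time_str = datetime.datetime.now().strftime("%H:%M:%S")
        print(f"{time_str}: {msg}")
    match level:
        case "info":
            logger.info(msg)
        case "debug":
            logger.debug(msg)


def _info(msg: str):
    _log(msg, "info")


def _debug(msg: str):
    _log(msg, "debug")


if __name__ == "__main__":
    # Illustrative usage, not part of the commit.
    logging.basicConfig(level=logging.INFO)
    _info("visible via the logger")
    _debug("only visible if the level is DEBUG or _BYPASS_LOGGING is True")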
@@ -265,7 +281,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         :returns: The input ``Model`` object. It may or may not be permissible
             to change fields before returning this object.
         """
-        _info(f"In register_model({model})")
+        _debug(f"In register_model({model})")
 
         # First attempt to interpret the model coordinates as a Llama model name
         resolved_llama_model = resolve_model(model.provider_model_id)
@@ -276,7 +292,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             # Not a Llama model name. Pass the model id through to vLLM's loader
             resolved_model_id = model.provider_model_id
 
-        _info(f"Resolved model id: {resolved_model_id}")
+        _info(f"Model id {model} resolved to {resolved_model_id}")
 
         if self.resolved_model_id is not None:
             if resolved_model_id != self.resolved_model_id:
@@ -291,6 +307,8 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             # Model already loaded
             return model
 
+        _info(f"Preloading model: {resolved_model_id}")
+
         # If we get here, this is the first time registering a model.
         # Preload so that the first inference request won't time out.
         engine_args = AsyncEngineArgs(
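For reference, the preload that this hunk now announces exists so that the first inference request does not pay the model-load cost and time out. A rough sketch of that kind of eager engine construction, assuming vLLM's AsyncLLMEngine.from_engine_args API and a placeholder model id that is not taken from this commit:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Placeholder model id for illustration; the provider passes its resolved_model_id.
engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct")

# Constructing the engine loads the weights up front, so the first request
# served later does not block on (and possibly time out during) model loading.
engine = AsyncLLMEngine.from_engine_args(engine_args)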
@@ -317,8 +335,8 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             # No info -- choose a default so we can at least attempt tool
             # use.
             tool_parser = DEFAULT_TOOL_PARSER
-        _info(f"{hf_config_class_name=}")
-        _info(f"{tool_parser=}")
+        _debug(f"{hf_config_class_name=}")
+        _debug(f"{tool_parser=}")
 
         # Wrap the lower-level engine in an OpenAI-compatible chat API
         model_config = await self.engine.get_model_config()
@@ -382,7 +400,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
 
         converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs)
 
-        _info(f"{converted_sampling_params=}")
+        _debug(f"{converted_sampling_params=}")
 
         if stream:
             return self._streaming_completion(content, converted_sampling_params)
@@ -452,10 +470,6 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         else:
             raise ValueError(f"Unrecognized stop reason '{stop_reason_str}'")
 
-        # _info(f"completion string: {completion_string}")
-        # _info(f"stop reason: {stop_reason_str}")
-        # _info(f"completion tokens: {completion_tokens}")
-
         # vLLM's protocol outputs the stop token, then sets end of message
         # on the next step for some reason.
         if request_output.outputs[-1].token_ids[-1] == eos_token_id:
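As the surviving comment notes, vLLM emits the stop token in the output and only flags end of message on the following step, so the provider trims a trailing EOS id itself. A minimal illustration of that trimming; the token ids and eos_token_id value below are made up for the example:

# Illustrative only: drop a trailing EOS token id before decoding or token counting.
eos_token_id = 128001                 # hypothetical EOS id for this example
token_ids = [791, 4320, 374, 128001]  # hypothetical generated token ids

if token_ids and token_ids[-1] == eos_token_id:
    token_ids = token_ids[:-1]

print(token_ids)  # [791, 4320, 374]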
@@ -515,10 +529,10 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         )
         chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options)
 
-        _info(f"Converted request: {chat_completion_request}")
+        _debug(f"Converted request: {chat_completion_request}")
 
         vllm_result = await self.chat.create_chat_completion(chat_completion_request)
-        _info(f"Result from vLLM: {vllm_result}")
+        _debug(f"Result from vLLM: {vllm_result}")
         if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse):
             raise ValueError(f"Error from vLLM layer: {vllm_result}")
 
@@ -575,7 +589,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
 
         # TODO: Convert logprobs
 
-        _info(f"Converted message: {converted_message}")
+        _debug(f"Converted message: {converted_message}")
 
         return ChatCompletionResponse(
             completion_message=converted_message,
@@ -629,7 +643,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             # Anything that is not "[DONE]" should be a JSON record
             parsed_chunk = json.loads(data_str)
 
-            print(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
+            _debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
 
             # The result may contain multiple completions, but Llama Stack APIs
             # only support returning one.
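For context on the chunk handling above: the OpenAI-compatible endpoint streams Server-Sent Events whose data: payload is either the literal [DONE] sentinel or a JSON-encoded completion chunk. A hedged sketch of that parsing loop, with a hard-coded lines list standing in for the real async byte stream:

import json

# Stand-in for the SSE lines an OpenAI-compatible server would stream back.
lines = [
    'data: {"choices": [{"delta": {"content": "Hello"}}]}',
    'data: {"choices": [{"delta": {"content": " world"}}]}',
    "data: [DONE]",
]

for line in lines:
    if not line.startswith("data: "):
        continue                        # ignore comments and keep-alives
    data_str = line[len("data: "):]
    if data_str.strip() == "[DONE]":
        break                           # end-of-stream sentinel, not JSON
    parsed_chunk = json.loads(data_str)  # anything else should be a JSON record
    print(parsed_chunk["choices"][0]["delta"].get("content", ""))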