diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index bf95b88a9..dbfb91382 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -104,11 +104,27 @@ logger = logging.getLogger(__name__)
 ############################################################################
 # Local functions go here
 
+# For debugging stuff when the Llama Stack logger isn't cooperating
+_BYPASS_LOGGING = False
+
+
+def _log(msg: str, level: str):
+    if _BYPASS_LOGGING:
+        time_str = datetime.datetime.now().strftime("%H:%M:%S")
+        print(f"{time_str}: {msg}")
+    match level:
+        case "info":
+            logger.info(msg)
+        case "debug":
+            logger.debug(msg)
+
 
 def _info(msg: str):
-    time_str = datetime.datetime.now().strftime("%H:%M:%S")
-    print(f"{time_str}: {msg}")
-    # logger.info(msg)
+    _log(msg, "info")
+
+
+def _debug(msg: str):
+    _log(msg, "debug")
 
 
 def _random_uuid_str() -> str:
@@ -265,7 +281,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         :returns: The input ``Model`` object. It may or may not be permissible to
             change fields before returning this object.
         """
-        _info(f"In register_model({model})")
+        _debug(f"In register_model({model})")
 
         # First attempt to interpret the model coordinates as a Llama model name
         resolved_llama_model = resolve_model(model.provider_model_id)
@@ -276,7 +292,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             # Not a Llama model name. Pass the model id through to vLLM's loader
             resolved_model_id = model.provider_model_id
 
-        _info(f"Resolved model id: {resolved_model_id}")
+        _info(f"Model id {model} resolved to {resolved_model_id}")
 
         if self.resolved_model_id is not None:
             if resolved_model_id != self.resolved_model_id:
@@ -291,6 +307,8 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
                 # Model already loaded
                 return model
 
+        _info(f"Preloading model: {resolved_model_id}")
+
         # If we get here, this is the first time registering a model.
         # Preload so that the first inference request won't time out.
         engine_args = AsyncEngineArgs(
@@ -317,8 +335,8 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
                 # No info -- choose a default so we can at least attempt tool
                 # use.
                 tool_parser = DEFAULT_TOOL_PARSER
-        _info(f"{hf_config_class_name=}")
-        _info(f"{tool_parser=}")
+        _debug(f"{hf_config_class_name=}")
+        _debug(f"{tool_parser=}")
 
         # Wrap the lower-level engine in an OpenAI-compatible chat API
         model_config = await self.engine.get_model_config()
@@ -382,7 +400,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
 
         converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs)
 
-        _info(f"{converted_sampling_params=}")
+        _debug(f"{converted_sampling_params=}")
 
         if stream:
             return self._streaming_completion(content, converted_sampling_params)
@@ -452,10 +470,6 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         else:
             raise ValueError(f"Unrecognized stop reason '{stop_reason_str}'")
 
-        # _info(f"completion string: {completion_string}")
-        # _info(f"stop reason: {stop_reason_str}")
-        # _info(f"completion tokens: {completion_tokens}")
-
         # vLLM's protocol outputs the stop token, then sets end of message
         # on the next step for some reason.
         if request_output.outputs[-1].token_ids[-1] == eos_token_id:
@@ -515,10 +529,10 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         )
         chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options)
 
-        _info(f"Converted request: {chat_completion_request}")
+        _debug(f"Converted request: {chat_completion_request}")
 
         vllm_result = await self.chat.create_chat_completion(chat_completion_request)
-        _info(f"Result from vLLM: {vllm_result}")
+        _debug(f"Result from vLLM: {vllm_result}")
 
         if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse):
             raise ValueError(f"Error from vLLM layer: {vllm_result}")
@@ -575,7 +589,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
 
         # TODO: Convert logprobs
 
-        _info(f"Converted message: {converted_message}")
+        _debug(f"Converted message: {converted_message}")
 
         return ChatCompletionResponse(
             completion_message=converted_message,
@@ -629,7 +643,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
 
             # Anything that is not "[DONE]" should be a JSON record
             parsed_chunk = json.loads(data_str)
-            print(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
+            _debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
 
             # The result may contain multiple completions, but Llama Stack APIs
             # only support returning one.
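
Reviewer note: below is a minimal, self-contained sketch of the logging shim this patch introduces, runnable outside the provider (the module scaffolding and the __main__ demo are hypothetical; the _log/_info/_debug bodies mirror the added lines above). One behavioral detail worth confirming in review: when _BYPASS_LOGGING is true, _log prints a timestamped copy of the message in addition to, not instead of, the normal logger call.

    import datetime
    import logging

    logger = logging.getLogger(__name__)

    # Module-level switch; set to True when the Llama Stack logger
    # isn't cooperating (shown enabled here for the demo)
    _BYPASS_LOGGING = True


    def _log(msg: str, level: str):
        # Bypass path: timestamped print runs first, and the regular
        # logger call below still runs afterwards
        if _BYPASS_LOGGING:
            time_str = datetime.datetime.now().strftime("%H:%M:%S")
            print(f"{time_str}: {msg}")
        match level:  # structural pattern matching requires Python >= 3.10
            case "info":
                logger.info(msg)
            case "debug":
                logger.debug(msg)


    def _info(msg: str):
        _log(msg, "info")


    def _debug(msg: str):
        _log(msg, "debug")


    if __name__ == "__main__":
        logging.basicConfig(level=logging.DEBUG)
        _info("registering model")   # printed with a timestamp, then logged at INFO
        _debug("converted request")  # printed with a timestamp, then logged at DEBUG

One side effect of the match dispatch as written: an unrecognized level string falls through silently. If that isn't intended, a `case _:` arm that raises ValueError would make misuse visible.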