From 43998e4348f81f1365f0392921d8d7e3c397bf3d Mon Sep 17 00:00:00 2001
From: Fred Reiss
Date: Tue, 18 Feb 2025 16:51:47 -0800
Subject: [PATCH] Add explanatory comment

---
 llama_stack/providers/inline/inference/vllm/vllm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 8798c0d5e..2c6bcbf82 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -503,8 +503,13 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         results_generator = self.engine.generate(content, sampling_params, request_id)
 
         # Need to know the model's EOS token ID for the conversion code below.
-        # This information is buried pretty deeply.
-        eos_token_id = self.engine.engine.tokenizer.tokenizer.eos_token_id
+        # AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if
+        # we drill down to the LLMEngine inside the AsyncLLMEngine.
+        # Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup,
+        # and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup.
+        llm_engine = self.engine.engine
+        tokenizer_group = llm_engine.tokenizer
+        eos_token_id = tokenizer_group.tokenizer.eos_token_id
 
         request_output: vllm.RequestOutput = None
         async for request_output in results_generator:
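
For reference, the wrapper layers described in the new comment can be exercised in a standalone script. The sketch below is not part of the patch: it assumes the same vLLM internals the patched code relies on (an AsyncLLMEngine exposing its wrapped LLMEngine as .engine, whose tokenizer group in turn wraps the Hugging Face tokenizer), and the model name is purely illustrative.

from vllm import AsyncEngineArgs, AsyncLLMEngine

# Build an AsyncLLMEngine roughly the way the provider does (model name is illustrative only).
engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model="facebook/opt-125m"))

# AsyncLLMEngine wraps an LLMEngine; the LLMEngine's tokenizer is a tokenizer group
# that in turn wraps the Hugging Face tokenizer carrying eos_token_id.
llm_engine = engine.engine
tokenizer_group = llm_engine.tokenizer
eos_token_id = tokenizer_group.tokenizer.eos_token_id
print(eos_token_id)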