From 43998e4348f81f1365f0392921d8d7e3c397bf3d Mon Sep 17 00:00:00 2001
From: Fred Reiss
Date: Tue, 18 Feb 2025 16:51:47 -0800
Subject: [PATCH] Add explanatory comment

---
 llama_stack/providers/inline/inference/vllm/vllm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 8798c0d5e..2c6bcbf82 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -503,8 +503,13 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         results_generator = self.engine.generate(content, sampling_params, request_id)
 
         # Need to know the model's EOS token ID for the conversion code below.
-        # This information is buried pretty deeply.
-        eos_token_id = self.engine.engine.tokenizer.tokenizer.eos_token_id
+        # AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if
+        # we drill down to the LLMEngine inside the AsyncLLMEngine.
+        # Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup,
+        # and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup.
+        llm_engine = self.engine.engine
+        tokenizer_group = llm_engine.tokenizer
+        eos_token_id = tokenizer_group.tokenizer.eos_token_id
 
         request_output: vllm.RequestOutput = None
         async for request_output in results_generator:
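
For reference, the wrapper layers described in the new comment can be exercised in a standalone script. The sketch below is not part of the patch: it assumes the same vLLM internals the patched code relies on (an AsyncLLMEngine exposing its wrapped LLMEngine as .engine, whose tokenizer group in turn wraps the Hugging Face tokenizer), and the model name is purely illustrative.

from vllm import AsyncEngineArgs, AsyncLLMEngine

# Build an AsyncLLMEngine roughly the way the provider does (model name is illustrative only).
engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model="facebook/opt-125m"))

# AsyncLLMEngine wraps an LLMEngine; the LLMEngine's tokenizer is a tokenizer group
# that in turn wraps the Hugging Face tokenizer carrying eos_token_id.
llm_engine = engine.engine
tokenizer_group = llm_engine.tokenizer
eos_token_id = tokenizer_group.tokenizer.eos_token_id
print(eos_token_id)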