diff --git a/llama_stack/models/llama/llama4/chat_format.py b/llama_stack/models/llama/llama4/chat_format.py
index 75d7cbc0e..c873012d6 100644
--- a/llama_stack/models/llama/llama4/chat_format.py
+++ b/llama_stack/models/llama/llama4/chat_format.py
@@ -203,14 +203,8 @@ class ChatFormat:
             tokens.extend(toks)
             images.extend(imgs)
 
-        # if message.role == "assistant" and len(message.tool_calls) > 0:
-        #     tokens.append(self.tokenizer.special_tokens["<|python_start|>"])
-
         _process_content(message.content)
 
-        # if message.role == "assistant" and len(message.tool_calls) > 0:
-        #     tokens.append(self.tokenizer.special_tokens["<|python_end|>"])
-
         if message.role == "user" and message.context is not None:
             # This is RAG context; why is it here in the chat format? I don't think
             # this is needed and can be moved upwards
@@ -222,6 +216,7 @@ class ChatFormat:
                 content = ToolUtils.encode_tool_call(t, tool_prompt_format)
                 _process_content(content)
 
+        # Tool calls and Tool Response messages should be eom
         eom = False
         if message.role == "assistant":
             eom = message.stop_reason == StopReason.end_of_message or message.tool_calls
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 029314d88..da217728b 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -6,8 +6,11 @@
 
 import asyncio
 import logging
+import os
 from typing import AsyncGenerator, List, Optional, Union
 
+from termcolor import cprint
+
 from llama_stack.apis.common.content_types import (
     TextDelta,
     ToolCallDelta,
@@ -338,9 +341,8 @@ class MetaReferenceInferenceImpl(
         stop_reason = None
 
         for token_result in self.generator.chat_completion(request):
-            from termcolor import cprint
-
-            cprint(token_result.text, "cyan", end="")
+            if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                cprint(token_result.text, "cyan", end="")
 
             tokens.append(token_result.token)
 
@@ -390,9 +392,8 @@ class MetaReferenceInferenceImpl(
         ipython = False
 
         for token_result in self.generator.chat_completion(request):
-            from termcolor import cprint
-
-            cprint(token_result.text, "cyan", end="")
+            if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                cprint(token_result.text, "cyan", end="")
 
             tokens.append(token_result.token)
 
diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json
index f842bca9a..c84d29e64 100644
--- a/tests/integration/test_cases/inference/chat_completion.json
+++ b/tests/integration/test_cases/inference/chat_completion.json
@@ -128,7 +128,7 @@
       ],
       "tool_responses": [
        {
-          "response": "{'resposne': '70 degrees and foggy'}"
+          "response": "{'response': '70 degrees and foggy'}"
        }
      ],
      "expected": [
@@ -174,7 +174,7 @@
      ],
      "tool_responses": [
        {
-          "response": "{'resposne': '70 degrees and foggy'}"
+          "response": "{'response': '70 degrees and foggy'}"
        }
      ],
      "expected": [
@@ -398,7 +398,7 @@
          "response": "{'response': 'Total expenses for January 2025: $1000'}"
        },
        {
-          "response": "{'resposne': 'Total expenses for February 2024: $2000'}"
+          "response": "{'response': 'Total expenses for February 2024: $2000'}"
        }
      ],
      "expected": [
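
Below is a minimal, hypothetical sketch of how the LLAMA_MODELS_DEBUG gate introduced above is expected to behave. The environment variable name and the cprint call come from the diff; the debug_print helper and the example call are illustrative only and are not part of the patch.

    # Illustrative sketch only -- mirrors the gating added in inference.py above.
    # LLAMA_MODELS_DEBUG is taken from the diff; debug_print is a hypothetical
    # helper, not an API in llama_stack.
    import os

    from termcolor import cprint


    def debug_print(text: str) -> None:
        # Per-token debug output is emitted only when explicitly enabled,
        # instead of being printed unconditionally on every generation step.
        if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
            cprint(text, "cyan", end="")


    # Run with LLAMA_MODELS_DEBUG=1 to see cyan output; with the variable
    # unset, the call below prints nothing.
    debug_print("token ")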