diff --git a/llama_stack/models/llama/llama3/chat_format.py b/llama_stack/models/llama/llama3/chat_format.py
index f55cd5e1c..fe7a7a898 100644
--- a/llama_stack/models/llama/llama3/chat_format.py
+++ b/llama_stack/models/llama/llama3/chat_format.py
@@ -226,7 +226,6 @@ class ChatFormat:
                     arguments_json=json.dumps(tool_arguments),
                 )
             )
-            content = ""
 
         return RawMessage(
             role="assistant",
diff --git a/llama_stack/models/llama/llama4/chat_format.py b/llama_stack/models/llama/llama4/chat_format.py
index 160bb00f8..9d60d00e9 100644
--- a/llama_stack/models/llama/llama4/chat_format.py
+++ b/llama_stack/models/llama/llama4/chat_format.py
@@ -301,7 +301,6 @@ class ChatFormat:
                     arguments=tool_arguments,
                 )
             )
-            content = ""
 
         return RawMessage(
             role="assistant",
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index da5ded0f3..0b56ba1f7 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -452,7 +452,7 @@ class MetaReferenceInferenceImpl(
             for token_results in self.generator.chat_completion(request_batch):
                 first = token_results[0]
-                if not first.finished:
+                if not first.finished and not first.ignore_token:
                     if os.environ.get("LLAMA_MODELS_DEBUG", "0") in ("1", "2"):
                         cprint(first.text, "cyan", end="")
                     if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":